def _make_mdx_index(self, db_name):
    """(Re)build the SQLite index database for ``self._mdx_file``.

    Any existing file at ``db_name`` is removed first. The new database gets
    an ``MDX_INDEX`` table with one row per item of the ``index_dict_list``
    returned by ``MDX.get_index`` and a ``META`` table holding the header
    values plus the module-level ``version``. When ``self._sql_index`` is
    truthy an index on ``key_text`` is created as well. Afterwards the header
    values are cached on the instance (``_encoding``, ``_stylesheet``,
    ``_title``, ``_description``).

    Args:
        db_name: Path of the SQLite database file to create.
    """
    if os.path.exists(db_name):
        os.remove(db_name)
    mdx = MDX(self._mdx_file)
    self._mdx_db = db_name
    returned_index = mdx.get_index(check_block=self._check)
    index_list = returned_index['index_dict_list']
    meta = returned_index['meta']

    conn = sqlite3.connect(db_name)
    # try/finally releases the connection even when a statement fails
    # part-way through the build.
    try:
        c = conn.cursor()
        c.execute(
            ''' CREATE TABLE MDX_INDEX
               (key_text text not null,
                file_pos integer,
                compressed_size integer,
                decompressed_size integer,
                record_block_type integer,
                record_start integer,
                record_end integer,
                offset integer
                )'''
        )
        tuple_list = [
            (item['key_text'],
             item['file_pos'],
             item['compressed_size'],
             item['decompressed_size'],
             item['record_block_type'],
             item['record_start'],
             item['record_end'],
             item['offset'])
            for item in index_list
        ]
        c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
                      tuple_list)

        # build the metadata table
        c.execute(
            '''CREATE TABLE META
               (key text,
                value text
                )''')
        c.executemany(
            'INSERT INTO META VALUES (?,?)',
            [('encoding', meta['encoding']),
             ('stylesheet', meta['stylesheet']),
             ('title', meta['title']),
             ('description', meta['description']),
             ('version', version)])

        if self._sql_index:
            c.execute(
                '''
                CREATE INDEX key_index ON MDX_INDEX (key_text)
                ''')
        conn.commit()
    finally:
        conn.close()

    # set class member
    self._encoding = meta['encoding']
    # The stylesheet is stored as JSON text in META; keep the parsed form here.
    self._stylesheet = json.loads(meta['stylesheet'])
    self._title = meta['title']
    self._description = meta['description']
def _make_mdx_index(self, db_name):
    """Build the SQLite index database for ``self._mdx_file``.

    Creates an ``MDX_INDEX`` table with one row per item of the
    ``index_dict_list`` returned by ``MDX.get_index`` and a ``META`` table
    holding the header values, then caches those header values on the
    instance (``_encoding``, ``_stylesheet``, ``_title``, ``_description``).

    Args:
        db_name: Path of the SQLite database file to create.
    """
    mdx = MDX(self._mdx_file)
    self._mdx_db = db_name
    # Fetch the index once and reuse it for both the rows and the metadata
    # instead of calling get_index() a second time.
    returned_index = mdx.get_index()
    index_list = returned_index['index_dict_list']
    meta = returned_index['meta']

    conn = sqlite3.connect(db_name)
    # try/finally releases the connection even when a statement fails.
    try:
        c = conn.cursor()
        c.execute(
            ''' CREATE TABLE MDX_INDEX
               (key_text text,
                file_pos integer,
                compressed_size integer,
                record_block_type integer,
                record_start integer,
                record_end integer,
                offset integer
                )'''
        )
        tuple_list = [
            (item['key_text'],
             item['file_pos'],
             item['compressed_size'],
             item['record_block_type'],
             item['record_start'],
             item['record_end'],
             item['offset'])
            for item in index_list
        ]
        c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?)',
                      tuple_list)

        # build the metadata table
        c.execute(
            '''CREATE TABLE META
               (key text,
                value text
                )''')
        c.executemany(
            'INSERT INTO META VALUES (?,?)',
            [('encoding', meta['encoding']),
             ('stylesheet', meta['stylesheet']),
             ('title', meta['title']),
             ('description', meta['description'])])
        conn.commit()
    finally:
        conn.close()

    # set class member
    self._encoding = meta['encoding']
    # The stylesheet is stored as JSON text in META; keep the parsed form here.
    self._stylesheet = json.loads(meta['stylesheet'])
    self._title = meta['title']
    self._description = meta['description']
def _():
    """Read only the MDX header and cache its metadata on ``self``.

    Sets ``header_build_flag`` and then copies the encoding, the JSON-decoded
    stylesheet, the title and the description from the header.

    NOTE(review): ``self`` is not a parameter of this function, so it has to
    come from an enclosing scope - confirm against the full file.
    """
    # The flag is raised before the file is opened, so it stays set even if
    # constructing MDX fails.
    self.header_build_flag = True
    header = MDX(self._mdx_file, only_header=True)
    self._encoding = header.meta['encoding']
    self._stylesheet = json.loads(header.meta['stylesheet'])
    self._title = header.meta['title']
    self._description = header.meta['description']
def _make_mdx_index(self):
    """(Re)build the SQLite index database at ``self._mdx_db``.

    Any existing database file is removed first. The new database gets an
    ``MDX_INDEX`` table with one row per item returned by ``MDX.get_index``
    and a ``META`` table filled from ``self.meta`` (the stylesheet is stored
    as JSON text) plus the module-level ``version``. When
    ``self._sql_index`` is truthy an index on ``key_text`` is created too.
    """
    if os.path.exists(self._mdx_db):
        os.remove(self._mdx_db)
    mdx = MDX(self._mdx_file, only_header=False)
    index_list = mdx.get_index(check_block=self._check)

    conn = sqlite3.connect(self._mdx_db)
    # try/finally releases the connection even when a statement fails
    # part-way through the build.
    try:
        c = conn.cursor()
        c.execute(
            ''' CREATE TABLE MDX_INDEX
               (key_text text not null,
                file_pos integer,
                compressed_size integer,
                decompressed_size integer,
                record_block_type integer,
                record_start integer,
                record_end integer,
                offset integer
                )'''
        )
        tuple_list = [
            (item['key_text'],
             item['file_pos'],
             item['compressed_size'],
             item['decompressed_size'],
             item['record_block_type'],
             item['record_start'],
             item['record_end'],
             item['offset'])
            for item in index_list
        ]
        c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
                      tuple_list)

        # build the metadata table
        c.execute(
            '''CREATE TABLE META
               (key text,
                value text
                )''')
        c.executemany(
            'INSERT INTO META VALUES (?,?)',
            [('encoding', self.meta['encoding']),
             ('stylesheet', json.dumps(self.meta['stylesheet'])),
             ('title', self.meta['title']),
             ('description', self.meta['description']),
             ('version', version)])

        if self._sql_index:
            c.execute(
                '''
                CREATE INDEX key_index ON MDX_INDEX (key_text)
                ''')
        conn.commit()
    finally:
        conn.close()
def make_sqlite(self):
    """Export the whole MDX dictionary into ``<mdx file>.sqlite.db``.

    Any existing database file is removed first. Every entry is decoded and
    written to an ``MDX_DICT`` table after tone-marked pinyin runs (with
    optional surrounding parentheses) and backtick-number markers have been
    stripped from the value. A ``META`` table receives the header values
    returned by ``MDX.get_index`` plus the module-level ``version``. When
    ``self._sql_index`` is truthy an index on ``key`` is created as well.
    """
    sqlite_file = self._mdx_file + '.sqlite.db'
    if os.path.exists(sqlite_file):
        os.remove(sqlite_file)
    mdx = MDX(self._mdx_file)

    # remove '(pīnyīn)', remove `1`:
    aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ'
    # Compiled once here rather than handing the pattern string to re.sub
    # for every entry.
    pattern = re.compile(
        r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?" % (aeiou, aeiou, aeiou))

    conn = sqlite3.connect(sqlite_file)
    # try/finally releases the connection even when decoding or a statement
    # fails part-way through the export.
    try:
        cursor = conn.cursor()
        cursor.execute(
            ''' CREATE TABLE MDX_DICT
                (key text not null,
                 value text
                )''')
        # Stream the rows so the whole dictionary is not held in one list.
        rows = ((key.decode(), pattern.sub('', value.decode()))
                for key, value in mdx.items())
        cursor.executemany('INSERT INTO MDX_DICT VALUES (?,?)', rows)

        returned_index = mdx.get_index(check_block=self._check)
        meta = returned_index['meta']
        cursor.execute('''CREATE TABLE META (key text, value text)''')
        cursor.executemany(
            'INSERT INTO META VALUES (?,?)',
            [('encoding', meta['encoding']),
             ('stylesheet', meta['stylesheet']),
             ('title', meta['title']),
             ('description', meta['description']),
             ('version', version)])

        if self._sql_index:
            cursor.execute(
                '''
                CREATE INDEX key_index ON MDX_DICT (key)
                ''')
        conn.commit()
    finally:
        conn.close()
def _interleave_text(texts, words):
    """Alternate text nodes with inline words, then append the last text node."""
    parts = []
    for pos, word in enumerate(words):
        # Tolerate markup with fewer text nodes than inline words instead of
        # raising IndexError.
        if pos < len(texts):
            parts.append(texts[pos])
        parts.append(word)
    if texts:
        parts.append(texts[-1])
    return parts


def collins(filename, limit=12):
    """Print a plain-text rendering of the first entries of an MDX file.

    For each entry the headword is printed, followed by every element with
    class ``C1_explanation_item``: either its "see also" links or its
    label/translation line, its English explanation and its example
    sentences (English, then the text of the second paragraph).

    Args:
        filename: Path of the MDX dictionary to read.
        limit: Maximum number of entries to print (defaults to 12).
    """
    mdx = MDX(filename)
    # zip() against range() ends cleanly when the dictionary holds fewer than
    # `limit` entries, where a bare next() would raise StopIteration.
    for _, item in zip(range(limit), mdx.items()):
        print(item[0])
        dom = etree.HTML(item[1])
        for e in dom.xpath(r'//*[@class="C1_explanation_item"]'):
            seealso = e.xpath(r'div/*/a[@class="C1_explain"]/text()')
            if seealso:
                print('see also', seealso)
                continue

            cn = (e.xpath(r'div/span[@class="C1_explanation_label"]/text()')
                  + e.xpath(r'div/span[@class="C1_text_blue"]/text()'))
            print(''.join(cn))

            en = _interleave_text(
                e.xpath(r'div/text()'),
                e.xpath(r'div/span[@class="C1_inline_word"]/text()'))
            en = e.xpath(r'div/span[@class="C1_word_gram"]/text()') + en
            print(''.join(en))

            for sen in e.xpath(r'ul/li'):
                sentence_en = _interleave_text(
                    sen.xpath(r'p[1]/text()'),
                    sen.xpath(r'p[1]/span[@class="C1_text_blue"]/text()'))
                print(''.join(sentence_en))
                sentence_cn = sen.xpath(r'p[2]/text()')
                print(''.join(sentence_cn))
# -*- coding: utf-8 -*- from collections import Counter import re import logging from bs4 import BeautifulSoup from readmdict import MDX, MDD dirstring = "资料来源整理/资料来源整理/" mdx = MDX("21世纪大英汉词典.mdx") r = r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?" r1 = r"`\d`" #logging.basicConfig() logger = logging.getLogger('root') fh = logging.FileHandler('test.log') fh.setLevel(logging.INFO) logger.addHandler(fh) pattern = re.compile(r) pattern1 = re.compile(r1) dic = {} valuelist = [] for key, value in mdx.items(): try: soup = BeautifulSoup(value) #print soup.prettify() phone = soup.find("span", class_="phone") tree = soup.find_all("span", class_="trs") # print key synonym = soup.find("span", class_="syno") #同义词 antonym = soup.find("span", class_="anto")
# -*- coding: utf-8 -*- from collections import Counter import re import logging from bs4 import BeautifulSoup from readmdict import MDX,MDD dirstring="资料来源整理/资料来源整理/" mdx=MDX("21世纪大英汉词典.mdx") r=r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?" r1=r"`\d`" #logging.basicConfig() logger = logging.getLogger('root') fh = logging.FileHandler('test.log') fh.setLevel(logging.INFO) logger.addHandler(fh) pattern=re.compile(r) pattern1=re.compile(r1) dic={} valuelist=[] for key,value in mdx.items(): try: soup=BeautifulSoup(value) #print soup.prettify() phone= soup.find("span",class_="phone") tree=soup.find_all("span",class_="trs") # print key synonym=soup.find("span",class_="syno") #同义词 antonym=soup.find("span",class_="anto") #反义词
# Python 2 script (uses the print statement): walks every entry of an MDX
# dictionary, strips the backtick markers from its definition and prints the
# resulting lines.
import logging
from collections import Counter
import re
import json
import psycopg2
# NOTE(review): the database endpoint and credentials are hard-coded here;
# consider loading them from configuration or the environment.
# NOTE(review): conn/cur are not used in the visible part of the script -
# presumably later code writes to the database; confirm.
conn = psycopg2.connect(database="word", user="******", password="******", host="rm-2zeg1e0w5v7w5v7y8o.pg.rds.aliyuncs.com", port="3432")
cur = conn.cursor()
logging.basicConfig()
from bs4 import BeautifulSoup
from readmdict import MDX, MDD
dirstring = "资料来源整理/资料来源整理/"
mdx = MDX("牛津英汉简明词典.mdx")
# Matches everything from a `1` marker through the next `2` marker, an
# optional bracketed group, optional whitespace and an optional <br>.
r = r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?"
# Matches any remaining single-digit backtick marker such as `3`.
r1 = r"`\d`"
pattern = re.compile(r)
pattern1 = re.compile(r1)
dic = {}
for key, value in mdx.items():
    valuelist = []
    # Drop the leading marker section before splitting into lines.
    match = pattern.sub("", value)
    #print match
    items = match.split("<br>")
    items = [item.strip() for item in items]
    for item in items:
        # Remove leftover backtick markers from each line.
        item = pattern1.sub("", item)
        valuelist.append(item)
        print item
#!/bin/python
# -*- coding: utf-8 -*-
# Export every entry of an MDX dictionary to a CSV file with the HTML tags
# removed from the definitions.
# You need to install lzo Library for gcc and python-lzo,readmdict,ripemd128 and pureSalsa20 Library for python
import csv
import re

csvaddress = 'XXXX.csv'  # Your CSV file name to export

# Compiled once at import time instead of on every call.
_TAG_RE = re.compile('<.*?>')


def remove_html_tags(text):
    """Remove html tags from a string"""
    return _TAG_RE.sub('', text)


def _to_text(raw):
    """Return ``raw`` as text, decoding it when it is a Python 3 bytes object."""
    # On Python 2 bytes is str, so values pass through untouched there.
    # NOTE(review): assumes the entries are UTF-8 encoded - confirm against
    # the readmdict version in use.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        return raw.decode('utf-8')
    return raw


def main():
    """Read the MDX dictionary and write all of its entries to the CSV file."""
    # Imported here so that importing this module for remove_html_tags does
    # not require readmdict to be installed.
    from readmdict import MDX  # or MDD

    mdx = MDX('XXXX.mdx')  # Open the original MDX Dictionary file
    rows = []
    # Iterate the generator exactly once: advancing it both in the for
    # statement and by hand would skip every other entry.
    for entry in mdx.items():
        row = [_to_text(field) for field in entry]
        row[1] = remove_html_tags(row[1])
        rows.append(row)

    # Write unconditionally once all entries are collected, rather than from
    # inside an exception handler.
    with open(csvaddress, 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(rows)


if __name__ == '__main__':
    main()
print(f"[x] {path} 已存在!") continue else: print(f"[ ] {path}", end='\r') path_tuple = path.split('\\') path_str = str(os.path.join( "pages", *path_tuple[:-1])) if len(path_tuple) > 1 else '' if path_str != '' and not os.path.exists(path_str): os.mkdir(path_str) src_file = open(os.path.join("pages", path), 'wb') src_file.write(data) src_file.flush() src_file.close() print("[v]") elif sys.argv[1].split('.')[-1] == "mdx": mdx_file = MDX(sys.argv[1], encoding='utf-8') index_info = {} print("正在提取词典页面文件...") for mdx_item in mdx_file.items(): item_name = base64.urlsafe_b64encode( mdx_item[0]).decode() + '.html' item_content = mdx_item[1].decode() page_path = os.path.join("pages", item_name) if os.path.exists(page_path): print(f"[x] {page_path} 已存在!") else: print(f"[ ] {page_path}", end='\r') if "@@@LINK=" in item_content: link = item_content[8:] item_content = "<a href=\"{}\">{}</a>".format( base64.urlsafe_b64encode(link.encode()).decode(), link)