def generate_soup(word):
    """Return a BeautifulSoup tree for the dictionary entry of ``word``.

    ``word`` is converted with ``str()`` and looked up in the Collins mdx
    file; the first lookup result is parsed with the ``lxml`` feature set.
    Indexing the first result fails if the lookup returns nothing.
    """
    collins = IndexBuilder('柯林斯高阶双解.mdx')
    entries = collins.mdx_lookup(str(word))
    first_entry = entries[0]
    return BeautifulSoup(first_entry, features='lxml')
def search_word_in_dict(word: str, dict: str, morphology: bool = True):
    """Look up ``word`` in the mdx dictionary ``dict``, optionally via its stems.

    :param word: word to search; surrounding spaces and newlines are stripped.
    :param dict: path of the mdx dictionary handed to ``IndexBuilder``.
        (The name shadows the builtin; it is kept so keyword callers keep
        working.)
    :param morphology: when true, and hunspell both accepts the word and
        yields stems for it, the stems are looked up instead of the word.
    :return: ``(matched_word, first_meaning)`` for the first candidate that has
        at least one meaning, otherwise ``(word, None)``.
    """
    word = word.strip(' \n')
    candidates = [word]
    if morphology:
        hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
        if hobj.spell(word):
            # Stem once and reuse the result for both the check and the list.
            stems = hobj.stem(word)
            if stems:
                candidates = [b.decode() for b in stems]
                logger.debug('Get stems: {}.'.format(', '.join(candidates)))

    builder = IndexBuilder(dict)
    builder.check_build()
    for candidate in candidates:
        meanings = builder.mdx_lookup(candidate, ignorecase=True)
        if not meanings:
            continue
        logger.debug('Find {} meanings of word {} from dictionary {}.'.format(
            len(meanings), candidate, dict))
        # Report the candidate that actually matched (it may be a stem).
        return candidate, meanings[0]

    logger.debug('Cannot find word {} from dictionary {}.'.format(word, dict))
    return word, None
def indexing(builder: IndexBuilder) -> int:
    """Index all examples in the lsc4 dict.

    TODO: performance is poor; the indexing step should happen while the mdx
    file is being parsed.

    :param builder: dict builder; supplies the sqlite index path
        (``get_mdx_db``) and the entry content (``mdx_lookup``).
    :return: 0 when Elasticsearch is disabled/unconnected or the index cannot
        be created.
        NOTE(review): after a complete run the function falls off the end and
        returns ``None`` despite the ``-> int`` annotation; confirm what
        callers expect before changing it.
    """
    if not USE_ES or not CONNECTED_ES:
        return 0
    # create index
    if not create_index():
        return 0

    conn = sqlite3.connect(builder.get_mdx_db())
    try:
        cursor = conn.execute('SELECT key_text FROM MDX_INDEX')
        keys = [row[0] for row in cursor]
    finally:
        # Release the connection even when the query fails.
        conn.close()

    examples = []
    for key in keys:
        # Concatenate all results for the key, dropping CRLFs and "entry:/".
        str_content = "".join(
            c.replace("\r\n", "").replace("entry:/", "")
            for c in builder.mdx_lookup(key)
        )
        exs = example_parse_lsc4("lsc4", key, str_content)
        if exs:
            examples.extend(exs)
            # Flush in batches so the pending list stays bounded.
            if len(examples) > 100000:
                ingest("lsc4", examples)
                examples = []
    ingest("lsc4", examples)
    print("indexing done", len(keys))
def test_builder_index(self):
    """Time index building, exact lookups and wildcard key queries (SQL index).

    Prints three timings: building the index, ``self._repeat`` exact lookups
    of 'dedicate', and ``self._repeat`` wildcard key queries for "dedi*".
    """
    # Delete existing Vocabulary*.db files first (presumably so the index is
    # rebuilt and the build timing is meaningful).
    for f in glob.glob("mdx/Vocabulary*.db"):
        os.remove(f)
    print("***with sql index***\n")

    start = time.time()
    bd = IndexBuilder(self._mdx_file, sql_index=True, check=False)
    print(
        "takes {0} seconds to build with sql index\n".format(time.time() - start))

    start = time.time()
    word = 'dedicate'
    for _ in range(self._repeat):
        bd.mdx_lookup(word)
    print("takes {0} second to lookup {1} {2} times\n".format(
        time.time() - start, word, self._repeat))

    # Restart the clock so the wildcard timing excludes the lookup loop above.
    start = time.time()
    for _ in range(self._repeat):
        bd.get_mdx_keys("dedi*")
    print("takes {0} second to lookup {1} {2} times\n".format(
        time.time() - start, "dedi*", self._repeat))
import re
from collections import namedtuple

import bs4
from bs4 import BeautifulSoup
from mdict_query import IndexBuilder

# Test: dump the first raw lookup result for 'great' to ./dicta.html.
builder = IndexBuilder('柯林斯高阶双解.mdx')
with open('./dicta.html', 'w+') as wp:
    wp.write(builder.mdx_lookup('great')[0])
#soup = BeautifulSoup(builder.mdx_lookup('f**k')[0], features="lxml")

# TODO: write the results to a database and run preliminary tests
# Lightweight record types; the field names are given in each definition.
Word_Title = namedtuple('Word_Title', 'name star level')
Word_Collins_Content = namedtuple(
    'Word_Collins_Content', 'interpretation usage usage_note word_format')
Word_Interpretation = namedtuple('Word_Interpretation', 'en cn')
Word_Usage = namedtuple('Word_Usage', 'description examples')
Word_Usage_Note = namedtuple('Word_Usage_Note', 'en cn')
Word_Format = namedtuple('Word_Format', 'format examples')


def generate_soup(word):
    """Produce the BeautifulSoup instance for the queried word."""
    # A fresh IndexBuilder is created on every call.
    builder = IndexBuilder('柯林斯高阶双解.mdx')
    # Keep only the first lookup result for str(word).
    the_word = builder.mdx_lookup(str(word))[0]
    # NOTE(review): the rest of this definition lies outside this view.
# Command-line query: argv[2] selects the query type ("key", "wildcard" or
# "wildcardcount") and argv[3] is the word or pattern. The result is printed
# as JSON, or as a bare count for "wildcardcount".
if len(sys.argv) < 4:
    # argv[3] is read below, so at least four argv entries are required.
    sys.exit("usage: expects the query type as argv[2] and the query word as argv[3]")

# NOTE: this name shadows the builtin; it is kept because code outside this
# view may read the module-level variable.
dict = {}
query_type = sys.argv[2]
query_word = sys.argv[3].strip()
builder = IndexBuilder('/Users/david/Desktop/G/ciku/Longman Dictionary of Contemporary English.mdx')
if query_type == "key":
    dict[query_word] = builder.mdx_lookup(query_word, True)
elif query_type == "wildcard":
    keys = builder.get_mdx_keys(query_word)
    for count, key in enumerate(keys, 1):
        dict[key] = builder.mdx_lookup(key)
        # Same cut-off as before: stop once more than 10 keys were resolved.
        if count > 10:
            break
elif query_type == "wildcardcount":
    keys = builder.get_mdx_keys(query_word)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(len(keys))
    sys.exit(0)
print(json.dumps(dict))
return newnote if __name__ == '__main__': config = configparser.ConfigParser() # get absolute path fp_dir = os.path.dirname(os.path.realpath(sys.argv[0])) iniFile = os.path.join(fp_dir, "Config.ini") # print(iniFile) config.read(iniFile, encoding='utf-8') mdict = config['Default']['mdxfile'] builder = IndexBuilder(mdict) # Word="abandon" Word = sys.argv[1] Meanings = builder.mdx_lookup(Word, ignorecase=True) record = Meanings[0] CardNote = NoteContent(Word, record) # print(CardNote) # t3=time.time() newnote = json.loads(CardNote, strict=False) # print(newnote) # t4=time.time() try: result = invoke('addNote', note=newnote) print(result) winsound.Beep(440, 250) # frequency, duration except: winsound.Beep(600, 250)
import os

from mdict_query import IndexBuilder

# Ad-hoc check against the OED dictionary: one wildcard key query and one
# exact lookup.
# os.path.join gives "mdx\\oed.mdx" on Windows (same as before) and a valid
# relative path on other platforms.
bd = IndexBuilder(os.path.join("mdx", "oed.mdx"))
keys = bd.get_mdx_keys("ded*")
result = bd.mdx_lookup('a')
pass  # no-op kept from the original (presumably a debugger breakpoint anchor)