Ejemplo n.º 1
0
def generate_soup(word, mdx_file='柯林斯高阶双解.mdx'):
    """Return a BeautifulSoup tree for the dictionary entry of ``word``.

    :param word: headword to look up; it is converted with ``str()`` before
        the lookup.
    :param mdx_file: path of the MDX dictionary to query. Defaults to the
        file name this function previously hard-coded.
    :return: ``BeautifulSoup`` object built from the first lookup result
        with the ``lxml`` parser.
    :raises IndexError: if the lookup yields no result for ``word``.
    """
    builder = IndexBuilder(mdx_file)
    entries = builder.mdx_lookup(str(word))
    if not entries:
        # Keep the exception type that the bare ``[0]`` raised on an empty
        # result, but say which word and which dictionary were involved.
        raise IndexError(
            'no entry for {!r} in {}'.format(str(word), mdx_file))

    return BeautifulSoup(entries[0], features='lxml')
Ejemplo n.º 2
0
def search_word_in_dict(word: str, dict: str, morphology: bool = True):
    """Look up ``word`` in the MDX dictionary file ``dict``.

    Leading/trailing spaces and newlines are stripped from ``word`` first.
    When ``morphology`` is true and hunspell both accepts the word and
    yields stems for it, the stems are looked up instead of the word itself.
    Lookups ignore case, and the first candidate with a result wins.

    :param word: the word to search for.
    :param dict: path of the MDX dictionary file. The name shadows the
        builtin but is kept because callers may pass it by keyword.
    :param morphology: whether to replace the word by its hunspell stems.
    :return: ``(matched_word, first_meaning)`` on success, otherwise
        ``(word, None)`` with the stripped input word.
    """
    word = word.strip(' \n')
    candidates = [word]
    if morphology:
        hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
        if hobj.spell(word):
            # Stem once and reuse the result instead of calling stem() twice.
            stems = hobj.stem(word)
            if stems:
                # The stems are decoded before use, as the original did.
                candidates = [b.decode() for b in stems]
                logger.debug('Get stems: {}.'.format(', '.join(candidates)))

    builder = IndexBuilder(dict)
    builder.check_build()
    for candidate in candidates:
        meanings = builder.mdx_lookup(candidate, ignorecase=True)
        if not meanings:
            continue
        logger.debug('Find {} meanings of word {} from dictionary {}.'.format(
            len(meanings), candidate, dict))
        # Report the candidate that actually matched (may be a stem).
        return candidate, meanings[0]
    logger.debug('Cannot find word {} from dictionary {}.'.format(word, dict))
    return word, None
Ejemplo n.º 3
0
def indexing(builder: IndexBuilder) -> int:
    """Index all examples of the lsc4 dictionary.

    TODO: performance is poor; the indexing should happen while the mdx
    file is being parsed.

    :param builder: dictionary builder; provides the path of the index
        database (``get_mdx_db``) and the entry lookup (``mdx_lookup``).
    :return: ``0`` when ``USE_ES`` or ``CONNECTED_ES`` is false or when
        ``create_index()`` fails; otherwise the number of dictionary keys
        that were processed. (The original fell off the end and returned
        ``None`` here despite the ``-> int`` annotation.)
    """
    if not USE_ES or not CONNECTED_ES:
        return 0

    # create index
    if not create_index():
        return 0

    conn = sqlite3.connect(builder.get_mdx_db())
    try:
        keys = [row[0]
                for row in conn.execute('SELECT key_text FROM MDX_INDEX')]
    finally:
        # Release the connection even if the query raises.
        conn.close()

    # Flush to the ingest step once more than this many examples are pending.
    batch_limit = 100000
    examples = []

    for key in keys:
        # Join all records of the key, dropping CRLF pairs and the
        # "entry:/" prefix, without quadratic string concatenation.
        str_content = "".join(
            c.replace("\r\n", "").replace("entry:/", "")
            for c in builder.mdx_lookup(key))
        exs = example_parse_lsc4("lsc4", key, str_content)
        if exs:
            examples.extend(exs)
            if len(examples) > batch_limit:
                ingest("lsc4", examples)
                examples = []
    # Ingest whatever is left over from the last partial batch.
    ingest("lsc4", examples)
    print("indexing done", len(keys))
    return len(keys)
Ejemplo n.º 4
0
    def test_builder_index(self):
        """Time index building, repeated lookups and wildcard key queries.

        Builds an ``IndexBuilder`` for ``self._mdx_file`` with
        ``sql_index=True`` and prints how long the build, ``self._repeat``
        lookups of one word, and ``self._repeat`` wildcard key queries take.
        Nothing is asserted; the method only prints timings.
        """
        # Delete previously generated database files matching the pattern.
        for f in glob.glob("mdx/Vocabulary*.db"):
            os.remove(f)
        print("***with sql index***\n")
        start = time.time()
        bd = IndexBuilder(self._mdx_file, sql_index=True, check=False)
        print(
            "takes {0} seconds to build with sql index\n".format(time.time() -
                                                                 start))

        start = time.time()
        word = 'dedicate'
        for _ in range(self._repeat):
            bd.mdx_lookup(word)
        print("takes {0} second to lookup {1} {2} times\n".format(
            time.time() - start, word, self._repeat))

        # Restart the clock: without this the wildcard timing would also
        # include the time spent in the lookup loop above.
        start = time.time()
        for _ in range(self._repeat):
            bd.get_mdx_keys("dedi*")
        print("takes {0} second to lookup {1} {2} times\n".format(
            time.time() - start, "dedi*", self._repeat))
Ejemplo n.º 5
0
import re
from collections import namedtuple

import bs4
from bs4 import BeautifulSoup

from mdict_query import IndexBuilder

# Smoke test: dump the first lookup result for 'great' to an HTML file.
# NOTE(review): this runs at import time, as in the original.
builder = IndexBuilder('柯林斯高阶双解.mdx')
# Write with an explicit UTF-8 encoding: the platform default encoding may
# not be able to represent the dictionary text. Plain 'w' is enough because
# the file is never read back through this handle.
with open('./dicta.html', 'w', encoding='utf-8') as wp:
    wp.write(builder.mdx_lookup('great')[0])

#soup = BeautifulSoup(builder.mdx_lookup('f**k')[0], features="lxml")
# TODO: write the results to a database and run preliminary tests

# Lightweight record types for the parsed pieces of a dictionary entry.
# Field meanings are not defined here; see the code that fills them in.

# Header of an entry.
Word_Title = namedtuple(typename='Word_Title', field_names='name star level')

# One content section of an entry, grouping the four record kinds below.
Word_Collins_Content = namedtuple(
    typename='Word_Collins_Content',
    field_names='interpretation usage usage_note word_format',
)

# An interpretation with ``en`` and ``cn`` fields.
Word_Interpretation = namedtuple(
    typename='Word_Interpretation', field_names='en cn')

# A usage with its description and examples.
Word_Usage = namedtuple(
    typename='Word_Usage', field_names='description examples')

# A usage note with ``en`` and ``cn`` fields.
Word_Usage_Note = namedtuple(
    typename='Word_Usage_Note', field_names='en cn')

# A word format with its examples.
Word_Format = namedtuple(
    typename='Word_Format', field_names='format examples')


def generate_soup(word):
    """Produce the BeautifulSoup instance for the queried word.

    NOTE(review): as shown here the function stops after the lookup and has
    no ``return`` statement, so it returns ``None``; the snippet looks
    truncated -- confirm against the full file.
    """

    # Open the hard-coded MDX file and keep the first lookup result for
    # ``str(word)``; an empty result raises IndexError on the ``[0]``.
    builder = IndexBuilder('柯林斯高阶双解.mdx')
    the_word = builder.mdx_lookup(str(word))[0]
Ejemplo n.º 6
0

# The query type is read from argv[2] and the query word from argv[3], so at
# least four argv entries are required. (The previously commented-out guard
# compared against 3, which would still let argv[3] raise IndexError.)
if len(sys.argv) < 4:
    sys.exit('expected the query type in argv[2] and the query word in argv[3]')

# Result mapping: key -> lookup result, printed as JSON at the end.
# NOTE: the name shadows the builtin ``dict``; it is kept unchanged because
# other parts of the file may refer to it.
dict = {}

query_type = sys.argv[2]
query_word = sys.argv[3].strip()

builder = IndexBuilder('/Users/david/Desktop/G/ciku/Longman Dictionary of Contemporary English.mdx')

if query_type == "key":
    # Exact lookup of a single key.
    dict[query_word] = builder.mdx_lookup(query_word, True)
elif query_type == "wildcard":
    # Look up the keys matching the pattern, stopping after the 11th entry.
    # NOTE(review): the original counter logic also yields 11 entries, not
    # 10 -- confirm which limit is intended.
    keys = builder.get_mdx_keys(query_word)
    for count, key in enumerate(keys, 1):
        dict[key] = builder.mdx_lookup(key)
        if count > 10:
            break
elif query_type == "wildcardcount":
    # Only report how many keys match, then stop without printing JSON.
    keys = builder.get_mdx_keys(query_word)
    print(len(keys))
    sys.exit(0)


# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(json.dumps(dict))
Ejemplo n.º 7
0
    return newnote


if __name__ == '__main__':
    # Script entry point: look up the word given as the first command-line
    # argument in the configured MDX dictionary, build a note from the first
    # result and submit it with the 'addNote' action. A beep signals the
    # outcome (440 for success, 600 for failure).
    config = configparser.ConfigParser()
    # Config.ini is read from the directory that contains the script itself.
    fp_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    iniFile = os.path.join(fp_dir, "Config.ini")
    config.read(iniFile, encoding='utf-8')
    mdict = config['Default']['mdxfile']
    builder = IndexBuilder(mdict)
    Word = sys.argv[1]
    Meanings = builder.mdx_lookup(Word, ignorecase=True)
    if not Meanings:
        # Nothing found: give the failure beep and exit with a message
        # instead of crashing with IndexError on ``Meanings[0]``.
        winsound.Beep(600, 250)
        sys.exit('No entry found for {!r} in {}'.format(Word, mdict))
    record = Meanings[0]

    CardNote = NoteContent(Word, record)
    # strict=False lets json accept control characters inside strings.
    newnote = json.loads(CardNote, strict=False)
    try:
        result = invoke('addNote', note=newnote)
        print(result)
        winsound.Beep(440, 250)  # frequency, duration
    except Exception as exc:
        # Still best-effort, but Ctrl-C / SystemExit are no longer swallowed
        # and the cause of the failure is reported.
        print('addNote failed: {}'.format(exc), file=sys.stderr)
        winsound.Beep(600, 250)
Ejemplo n.º 8
0
from mdict_query import IndexBuilder

# Minimal usage example of IndexBuilder on the file mdx\oed.mdx.
bd = IndexBuilder("mdx\\oed.mdx")
# Query the keys matching the wildcard pattern "ded*".
keys = bd.get_mdx_keys("ded*")
# Look up the key 'a'.
result = bd.mdx_lookup('a')
# No-op statement; presumably a convenient line for a debugger breakpoint.
pass