def lookup(word, external_cmd=True, cmd=DEFAULT_CMD, dict_name=DEFAULT_DICT_NAME):
    """Look up *word* in a macOS dictionary and return a list of entries.

    Entries are UTF-8 byte strings: first the headword (with its phonetic(s),
    when found), then one "<part-of-speech> <definition>" item per sense.
    Returns [] when no definition is found.

    :param word: word to look up, a UTF-8 byte string (English or Chinese).
    :param external_cmd: when True, shell out to *cmd*; otherwise use the
        in-process DictionaryServices framework (DCSCopyTextDefinition).
    :param cmd: path of the external helper binary.
    :param dict_name: 'oxford' or 'landau'.
    :raises DictLookupError: unknown *dict_name*, or *cmd* missing/not
        executable.

    NOTE(review): Python 2 code (``ur''`` literals). This file contains
    several versions of ``lookup``; only the last definition is bound at
    runtime.
    """
    # --- Fetch the raw definition text -----------------------------------
    if external_cmd:
        if os.path.isfile(cmd) and os.access(cmd, os.X_OK):
            if dict_name == 'oxford':
                dict_path = '/Library/Dictionaries/Simplified Chinese - English.dictionary'
            elif dict_name == 'landau':
                # Langdao ships separate English->Chinese / Chinese->English files.
                dict_file = 'langdao-ec-gb.dictionary' if is_english(word) else 'langdao-ce-gb.dictionary'
                dict_path = os.path.expanduser('~/Library/Dictionaries/{}'.format(dict_file))
            else:
                raise DictLookupError('dict name not valid.')
            proc = subprocess.Popen([cmd, dict_path, word], stdout=subprocess.PIPE)
            definition = proc.stdout.read()
            # The helper prints "(null)\n" when the word has no entry.
            if definition == '(null)\n':
                return []
            definition = definition.decode('utf-8')
        else:
            raise DictLookupError('file {} not found or not executable.'.format(cmd))
    else:
        # In-process lookup via the DictionaryServices framework.
        from DictionaryServices import DCSCopyTextDefinition
        unicode_word = word.decode('utf-8')
        word_range = (0, len(unicode_word))
        definition = DCSCopyTextDefinition(None, unicode_word, word_range)
        if definition is None:
            return []
    result = []
    # --- Parse the Oxford plain-text layout with regular expressions -----
    # For 'landau', no parsing happens here and ``result`` stays empty.
    # NOTE(review): no final ``return result`` is visible in this chunk;
    # the tail of the function may be truncated -- confirm upstream.
    if dict_name == 'oxford':
        is_eng = is_english(word)
        # Character classes used to build the parsing regexes below.
        number = u'①-⑳㉑-㉟㊱-㊿'  # circled sense numbers
        chinese = ur'\u4e00-\u9fa5'  # CJK unified ideographs
        pinyin = u'āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň'  # toned pinyin letters
        phrase = r"a-zA-Z,\. "
        sentence = ur"0-9a-zA-Z'‘’«»£\$/\?!,\.\[\]\(\) "
        pinyin_all = u"a-zA-Z{}'… ".format(pinyin)
        # An example sentence followed by its Chinese translation.
        sentence_full = ur'([{1}][{0}]*[{1}]|\([{0}]*[{1}]|[{1}][{0}]*\)) ?[{2}]+'.format(
            sentence, sentence.replace(r'\(\) ', ''), chinese)
        # Map the dictionary's part-of-speech labels to short abbreviations.
        part_map = {
            'noun': 'n.',
            'intransitive verb': 'vi.',
            'transitive verb': 'vt.',
            'adjective': 'adj.',
            'adverb': 'adv.',
            'determiner': 'det.',
            'pronoun': 'pron.',
            'preposition': 'prep.',
            'conjunction': 'conj.',
            'exclamation': 'excl.',
            'abbreviation': 'abbr.',
            'noun plural': 'pl.',
            'modifier': 'mod.'
        } if is_eng else {
            u'名词': u'n.',
            u'动词': u'v.',
            u'形容词': u'adj.',
            u'副词': u'adv.',
            u'数词': u'num.',
            u'代词': u'pron.',
            u'介词': u'prep.',
            u'连词': u'conj.',
            u'叹词': u'excl.'
        }
        # Usage labels stripped from each sense before output.
        ignore_list = [
            'Countable and uncountable', 'Uncountable and countable',
            'Countable', 'Uncountable', 'British', 'American', 'colloquial',
            'euphemistic', 'dated', 'Linguistics'
        ] if is_eng else [
            u'方言', u'客套话', u'委婉语', u'书面语', u'俗语', u'比喻义',
            u'口语', u'惯用语'
        ]
        phrase_mode = False
        if is_eng:
            word_escaped = re.escape(word)
            # If the definition does not start with the word itself, retry
            # with the first token (e.g. "give" for "give up"): the lookup
            # then targets a phrasal-verb sub-entry.
            if not re.match(word_escaped + '(?= )', definition, re.I):
                verb_escaped = re.escape(word.split(' ')[0])
                if not re.match(verb_escaped + '(?= )', definition, re.I):
                    return result
                phrase_mode = True
            pos = definition.find('PHRASAL VERB')
            if phrase_mode:
                if pos == -1:
                    return result
                # Keep only the PHRASAL VERB section, then cut it down to the
                # sub-entry for the requested phrase.
                definition = definition[pos:]
                match = re.search(r'(({0}:? )([A-Z]\. )?({1}).*?)(?=\b{2} [{3}]*?:? ([A-Z]\. )?({1}))'.format(
                    word_escaped, '|'.join(part_map.keys()), verb_escaped, phrase), definition)
                if match is None:
                    return result
                definition = match.group(1)
                start_pos = len(match.group(2))
            else:
                # Plain word: drop the PHRASAL VERB section entirely.
                if pos != -1:
                    definition = definition[:pos]
        if phrase_mode:
            result.append(word)
        else:
            # --- Extract phonetics -------------------------------------
            trimmed_len = 0
            single_phonetic = True
            if is_eng:
                # Homograph entries look like "A. |fonetik| ..."; collect each
                # distinct phonetic and splice the "|...|" span out of the
                # text (keeping the "A. " prefix). ``trimmed_len`` compensates
                # match offsets for earlier deletions.
                phonetics = []
                for match in re.finditer(r'[A-Z]\. \|(.*?)\| ?', definition):
                    phonetic = match.group(1).encode('utf-8').strip()
                    phonetic = '/{}/'.format(phonetic)
                    if phonetic not in phonetics:
                        phonetics.append(phonetic)
                    start = match.start() + 3 - trimmed_len
                    end = match.end() - trimmed_len
                    definition = definition[:start] + definition[end:]
                    trimmed_len += end - start
                if len(phonetics) > 0:
                    phonetics = ', '.join(phonetics)
                    result.append('{} {}'.format(word, phonetics))
                    single_phonetic = False
            if single_phonetic:
                # Single leading phonetic: "|...|" for English, a pinyin
                # token for Chinese.
                match = re.search(r'\|(.*?)\| ?' if is_eng else ur'([^ ]*[{}][^ ]*) ?'.format(pinyin), definition)
                if match is None:
                    return result
                phonetic = match.group(1).encode('utf-8').strip()
                result.append('{}{}'.format(word, ' /{}/'.format(phonetic) if phonetic else ''))
                start_pos = match.span()[1]
        # --- Locate part-of-speech sections -----------------------------
        # part_list holds (section start, body start, abbreviation) tuples.
        part_list = []
        pattern = (r'({}) ?(\(.*?\))? ?'.format('|'.join(part_map.keys()))
                   if is_eng else ur'({}) '.format('|'.join(part_map.keys())))
        if 'A. ' not in definition:
            # Single homograph: the part of speech follows the phonetic.
            match = re.match(pattern, definition[start_pos:])
            if match:
                part_list.append((start_pos, start_pos + match.span()[1], part_map[match.group(1)]))
        else:
            # Multiple homographs labelled "A. ", "B. ", ...
            for match in re.finditer(ur'[A-Z]\. {}'.format(pattern), definition):
                part_list.append((match.start(), match.end(), part_map[match.group(1)]))
        # --- Extract the senses of each part, walking backwards ----------
        last_start_pos = len(definition)
        pattern = (ur"([^{4}]*?([{0}][{1}]*? |[{2}]*?(\) |›)))(?=({3}|[{4}]|$))".format(
                       pinyin, pinyin_all, phrase, sentence_full, number)
                   if is_eng else
                   ur"(?![a-z] )([^{2}]*?[{0}]* )(?=([→{1}{2}]|$))".format(phrase, chinese, number))
        for part in reversed(part_list):
            entry_list = []
            text = definition[part[1]:last_start_pos]
            if u'① ' not in text:
                # Single sense for this part of speech.
                match = re.match(pattern, text)
                if match:
                    entry_list.append(match.group(1))
            else:
                # Numbered senses "① ", "② ", ...
                for match in re.finditer(ur'[{}] {}'.format(number, pattern), text):
                    entry_list.append(match.group(1))
            # Insert after the headword (index 0); walking parts in reverse
            # while always inserting at 1 preserves the original order.
            pos = 1
            for entry in entry_list:
                # Strip pinyin runs (English lookups) / "[used ...]" notes
                # (Chinese), usage labels, empty parens, excess whitespace.
                entry = re.sub(ur'[{0}]*[{1}][{0}]*'.format(pinyin_all, pinyin) if is_eng else r'\[used .*?\]', '', entry)
                entry = re.sub(ur'({})'.format('|'.join(ignore_list)), '', entry)
                entry = re.sub(r'\([ /]*\)', '', entry)
                entry = re.sub(r' {2,}', ' ', entry).strip()
                if is_eng:
                    entry = entry.replace(u' ;', u';')
                entry = (u'{} {}'.format(part[2], entry)).encode('utf-8')
                result.insert(pos, entry)
                pos += 1
            last_start_pos = part[0]
def lookup(word, external_cmd=True, parse_html=True, *args):
    """Look up *word* in the bundled Oxford Chinese dictionary.

    Entries are UTF-8 byte strings: first the headword with its phonetic(s),
    then one "<part-of-speech> <definition>" item per sense. Returns [] when
    no definition is found.

    :param word: word to look up, a UTF-8 byte string (English or Chinese).
    :param external_cmd: when True, run the bundled ``systemdict`` helper
        next to this file; otherwise use DCSCopyTextDefinition.
    :param parse_html: when True (and *external_cmd*), request HTML output
        and parse it with ElementTree; otherwise parse plain text via regex.
    :param args: ignored; kept for call compatibility.
    :raises DictLookupError: helper binary missing or not executable.

    NOTE(review): Python 2 code (``ur''`` literals). This file contains
    several versions of ``lookup``; only the last definition is bound at
    runtime.
    """
    # --- Fetch the raw definition --------------------------------------
    if external_cmd:
        cmd = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'systemdict')
        if os.path.isfile(cmd) and os.access(cmd, os.X_OK):
            dict_name = '牛津英汉汉英词典'
            proc = subprocess.Popen([cmd, '-t', 'html' if parse_html else 'text',
                                     '-d', dict_name, word],
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            definition = proc.stdout.read()
            if definition.strip() == '':
                return []
            if not parse_html:
                # The HTML path feeds bytes straight to ElementTree below;
                # only the text path is decoded here.
                definition = definition.decode('utf-8')
        else:
            raise DictLookupError('file {} not found or not executable.'.format(cmd))
    else:
        # In-process lookup via the DictionaryServices framework.
        from DictionaryServices import DCSCopyTextDefinition
        unicode_word = word.decode('utf-8')
        word_range = (0, len(unicode_word))
        definition = DCSCopyTextDefinition(None, unicode_word, word_range)
        if definition is None:
            return []
    result = []
    is_eng = is_english(word)
    mac_ver = StrictVersion(platform.mac_ver()[0])
    # Map part-of-speech labels to short abbreviations; the Chinese labels
    # are single characters on macOS >= 10.10, two characters before.
    part_map = {
        'noun': 'n.',
        'intransitive verb': 'vi.',
        'transitive verb': 'vt.',
        'reflexive verb': 'vr.',
        'adjective': 'adj.',
        'adverb': 'adv.',
        'determiner': 'det.',
        'pronoun': 'pron.',
        'preposition': 'prep.',
        'conjunction': 'conj.',
        'exclamation': 'excl.',
        'abbreviation': 'abbr.',
        'noun plural': 'pl.',
        'modifier': 'mod.'
    } if is_eng else {
        u'名' if mac_ver >= StrictVersion('10.10') else u'名词': u'n.',
        u'动' if mac_ver >= StrictVersion('10.10') else u'动词': u'v.',
        u'形' if mac_ver >= StrictVersion('10.10') else u'形容词': u'adj.',
        u'副' if mac_ver >= StrictVersion('10.10') else u'副词': u'adv.',
        u'数' if mac_ver >= StrictVersion('10.10') else u'数词': u'num.',
        u'代' if mac_ver >= StrictVersion('10.10') else u'代词': u'pron.',
        u'介' if mac_ver >= StrictVersion('10.10') else u'介词': u'prep.',
        u'连' if mac_ver >= StrictVersion('10.10') else u'连词': u'conj.',
        u'叹' if mac_ver >= StrictVersion('10.10') else u'叹词': u'excl.'
    }
    # NOTE(review): no final ``return result`` is visible in this chunk;
    # the tail of the function may be truncated -- confirm upstream.
    if external_cmd and parse_html:
        # use ElementTree to parse html
        ns = '{http://www.apple.com/DTDs/DictionaryService-1.0.rng}'
        phonetic_spans = []
        # The helper may emit several XML documents; split and parse each.
        for xml in definition.split('<?xml')[1:]:
            html = ET.fromstring('<?xml' + xml)
            entry = html.find('.//{}entry'.format(ns))
            word_span = entry.find("./span[@class='hwg']/span[@{}dhw]".format(ns))
            if word_span is None:
                continue
            if word_span.text.encode('utf-8').lower() == word.lower():
                # Exact headword match: parse the whole entry.
                root = entry
                # NOTE(review): this ``.format(ns)`` is a no-op (no
                # placeholder in the string) -- presumably harmless leftover.
                phonetic_span = entry.find("./span[@class='hwg']/span[@class='pr']".format(ns))
                if phonetic_span is not None:
                    phonetic_spans.append(phonetic_span)
            else:
                # Otherwise look for *word* as a phrasal-verb sub-entry.
                if not is_eng:
                    continue
                phrase_span = entry.find("./span[@class='pvb']//span[@class='pvg'][span='{}']/..".format(word))
                if phrase_span is None:
                    continue
                root = phrase_span
            # Each lexid span is one part-of-speech group.
            for span1 in root.findall("./span[@lexid]"):
                phonetic_span = span1.find("./span[@class='pr']")
                if phonetic_span is not None:
                    phonetic_spans.append(phonetic_span)
                part_span = span1.find("./span[@class='ps']")
                if part_span is None:
                    continue
                # Unknown labels fall back to "<label>.".
                part = part_map.get(part_span.text, part_span.text + '.')
                # Nested lexid spans are the individual senses.
                for span2 in span1.findall("./span[@lexid]"):
                    item = ''
                    for span3 in span2.findall('./span[@class]'):
                        if span3.attrib['class'] == 'ind':
                            item += ''.join(span3.itertext())
                        elif span3.attrib['class'] == 'trg':
                            # Translation group; skip the pinyin rendering.
                            for span4 in span3.findall('./span[@class]'):
                                if span4.attrib['class'] != 'trans ty_pinyin':
                                    item += ''.join(span4.itertext())
                    item = re.sub(r' {2,}', ' ', item).strip()
                    if item:
                        item = u'{} {}'.format(part, item).encode('utf-8')
                        result.append(item)
        # Deduplicate the collected phonetics and prepend the headword line.
        phonetics = []
        for phonetic_span in phonetic_spans:
            phonetic = ''.join(phonetic_span.itertext())
            phonetic = re.sub(r' {2,}', ' ', phonetic).encode('utf-8').strip(' |')
            phonetic = '/{}/'.format(phonetic)
            if phonetic not in phonetics:
                phonetics.append(phonetic)
        phonetics = ', '.join(phonetics)
        if phonetics or len(result) > 0:
            result.insert(0, '{} {}'.format(word, phonetics))
    else:
        # use regular expression to parse text
        # Character classes used to build the parsing regexes below.
        number = u'①-⑳㉑-㉟㊱-㊿'  # circled sense numbers
        chinese = ur'\u4e00-\u9fa5'  # CJK unified ideographs
        pinyin = u'āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň'  # toned pinyin letters
        phrase = r"a-zA-Z,\. "
        sentence = ur"0-9a-zA-Z'‘’«»£\$/\?!,\.\[\]\(\) "
        pinyin_all = u"a-zA-Z{}'… ".format(pinyin)
        # An example sentence followed by its Chinese translation.
        sentence_full = ur'([{1}][{0}]*[{1}]|\([{0}]*[{1}]|[{1}][{0}]*\)) ?[{2}]+'.format(
            sentence, sentence.replace(r'\(\) ', ''), chinese)
        # Usage labels stripped from each sense before output.
        ignore_list = [
            'Countable and uncountable', 'Uncountable and countable',
            'Countable', 'Uncountable', 'British', 'American', 'colloquial',
            'euphemistic', 'dated', 'Linguistics'
        ] if is_eng else [
            u'方言', u'客套话', u'委婉语', u'书面语', u'俗语', u'比喻义',
            u'口语', u'惯用语', u'旧词', u'敬辞'
        ]
        phrase_mode = False
        if is_eng:
            word_escaped = re.escape(word)
            # If the definition does not start with the word itself, retry
            # with the first token (phrasal-verb lookup).
            if not re.match(word_escaped + '(?= )', definition, re.I):
                verb_escaped = re.escape(word.split(' ')[0])
                if not re.match(verb_escaped + '(?= )', definition, re.I):
                    return result
                phrase_mode = True
            pos = definition.find('PHRASAL VERB')
            if phrase_mode:
                if pos == -1:
                    return result
                # Keep only the PHRASAL VERB section, cut to the sub-entry.
                definition = definition[pos:]
                match = re.search(r'(({0}:? )([A-Z]\. )?({1}).*?)(?=\b{2} [{3}]*?:? ([A-Z]\. )?({1}))'.format(
                    word_escaped, '|'.join(part_map.keys()), verb_escaped, phrase), definition)
                if match is None:
                    return result
                definition = match.group(1)
                start_pos = len(match.group(2))
            else:
                # Plain word: drop the PHRASAL VERB section entirely.
                if pos != -1:
                    definition = definition[:pos]
        if phrase_mode:
            result.append(word)
        else:
            # --- Extract phonetics -------------------------------------
            trimmed_len = 0
            single_phonetic = True
            if is_eng:
                # Homograph entries "A. |fonetik| ...": collect each distinct
                # phonetic and splice the "|...|" span out of the text.
                # ``trimmed_len`` compensates offsets for earlier deletions.
                phonetics = []
                for match in re.finditer(r'[A-Z]\. \|(.*?)\| ?', definition):
                    phonetic = match.group(1).encode('utf-8').strip()
                    phonetic = '/{}/'.format(phonetic)
                    if phonetic not in phonetics:
                        phonetics.append(phonetic)
                    start = match.start() + 3 - trimmed_len
                    end = match.end() - trimmed_len
                    definition = definition[:start] + definition[end:]
                    trimmed_len += end - start
                if len(phonetics) > 0:
                    phonetics = ', '.join(phonetics)
                    result.append('{} {}'.format(word, phonetics))
                    single_phonetic = False
            if single_phonetic:
                # Single leading phonetic: "|...|" (English) or pinyin token.
                match = re.search(r'\|(.*?)\| ?' if is_eng else ur'([^ ]*[{}][^ ]*) ?'.format(pinyin), definition)
                if match is None:
                    return result
                phonetic = match.group(1).encode('utf-8').strip()
                result.append('{}{}'.format(word, ' /{}/'.format(phonetic) if phonetic else ''))
                start_pos = match.span()[1]
        # --- Locate part-of-speech sections -----------------------------
        # part_list holds (section start, body start, abbreviation) tuples.
        part_list = []
        pattern = (r'({}) ?(\(.*?\))? ?'.format('|'.join(part_map.keys()))
                   if is_eng else ur'({}) '.format('|'.join(part_map.keys())))
        if 'A. ' not in definition:
            # Single homograph: the part of speech follows the phonetic.
            match = re.match(pattern, definition[start_pos:])
            if match:
                part_list.append((start_pos, start_pos + match.span()[1], part_map[match.group(1)]))
        else:
            # Multiple homographs labelled "A. ", "B. ", ...
            for match in re.finditer(ur'[A-Z]\. {}'.format(pattern), definition):
                part_list.append((match.start(), match.end(), part_map[match.group(1)]))
        # --- Extract the senses of each part, walking backwards ----------
        last_start_pos = len(definition)
        pattern = (ur"([^{4}]*?([{0}][{1}]*? |[{2}]*?(\) |›)))(?=({3}|[{4}]|$))".format(
                       pinyin, pinyin_all, phrase, sentence_full, number)
                   if is_eng else
                   ur"(?![a-z] )([^{2}]*?[{0}]* )(?=([→{1}{2}]|$))".format(phrase, chinese, number))
        for part in reversed(part_list):
            entry_list = []
            text = definition[part[1]:last_start_pos]
            if u'① ' not in text:
                # Single sense for this part of speech.
                match = re.match(pattern, text)
                if match:
                    entry_list.append(match.group(1))
            else:
                # Numbered senses "① ", "② ", ...
                for match in re.finditer(ur'[{}] {}'.format(number, pattern), text):
                    entry_list.append(match.group(1))
            # Insert after the headword (index 0); walking parts in reverse
            # while always inserting at 1 preserves the original order.
            pos = 1
            for entry in entry_list:
                # Strip pinyin runs (English) / "[used ...]" notes (Chinese),
                # usage labels, empty parens, excess whitespace.
                entry = re.sub(ur'[{0}]*[{1}][{0}]*'.format(pinyin_all, pinyin) if is_eng else r'\[used .*?\]', '', entry)
                entry = re.sub(ur'({})'.format('|'.join(ignore_list)), '', entry)
                entry = re.sub(r'\([ /]*\)', '', entry)
                entry = re.sub(r' {2,}', ' ', entry).strip()
                if is_eng:
                    entry = entry.replace(u' ;', u';')
                entry = (u'{} {}'.format(part[2], entry)).encode('utf-8')
                result.insert(pos, entry)
                pos += 1
            last_start_pos = part[0]
def lookup(word, external_cmd=True, parse_html=True, *args):
    """Look up *word* in the system Oxford Chinese dictionary.

    Entries are UTF-8 byte strings: first the headword with its phonetic(s),
    then one "<part-of-speech> <definition>" item per sense. Returns [] when
    no definition is found.

    :param word: word to look up, a UTF-8 byte string (English or Chinese).
    :param external_cmd: when True, run the bundled ``systemdict`` helper
        next to this file; otherwise use DCSCopyTextDefinition.
    :param parse_html: when True (and *external_cmd*), request HTML output
        and parse it with ElementTree; otherwise parse plain text via regex.
    :param args: ignored; kept for call compatibility.
    :raises DictLookupError: helper binary missing or not executable.

    NOTE(review): Python 2 code (``ur''`` literals). This file contains
    several versions of ``lookup``; only the last definition is bound at
    runtime.
    """
    # The dictionary's display name and its part-of-speech labels both
    # differ between macOS versions, keyed on 10.10 below.
    mac_ver = StrictVersion(platform.mac_ver()[0])
    # --- Fetch the raw definition --------------------------------------
    if external_cmd:
        cmd = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'systemdict')
        if os.path.isfile(cmd) and os.access(cmd, os.X_OK):
            dict_name = '牛津英汉汉英词典' if mac_ver >= StrictVersion(
                '10.10') else 'Oxford Chinese Dictionary'
            proc = subprocess.Popen([
                cmd, '-t', 'html' if parse_html else 'text', '-d', dict_name, word
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            definition = proc.stdout.read()
            if definition.strip() == '':
                return []
            if not parse_html:
                # The HTML path feeds bytes straight to ElementTree below;
                # only the text path is decoded here.
                definition = definition.decode('utf-8')
        else:
            raise DictLookupError(
                'file {} not found or not executable.'.format(cmd))
    else:
        # In-process lookup via the DictionaryServices framework.
        from DictionaryServices import DCSCopyTextDefinition
        unicode_word = word.decode('utf-8')
        word_range = (0, len(unicode_word))
        definition = DCSCopyTextDefinition(None, unicode_word, word_range)
        if definition is None:
            return []
    result = []
    is_eng = is_english(word)
    # Map part-of-speech labels to short abbreviations; the Chinese labels
    # are single characters on macOS >= 10.10, two characters before.
    part_map = {
        'noun': 'n.',
        'intransitive verb': 'vi.',
        'transitive verb': 'vt.',
        'reflexive verb': 'vr.',
        'adjective': 'adj.',
        'adverb': 'adv.',
        'determiner': 'det.',
        'pronoun': 'pron.',
        'preposition': 'prep.',
        'conjunction': 'conj.',
        'exclamation': 'excl.',
        'abbreviation': 'abbr.',
        'noun plural': 'pl.',
        'modifier': 'mod.'
    } if is_eng else {
        u'名' if mac_ver >= StrictVersion('10.10') else u'名词': u'n.',
        u'动' if mac_ver >= StrictVersion('10.10') else u'动词': u'v.',
        u'形' if mac_ver >= StrictVersion('10.10') else u'形容词': u'adj.',
        u'副' if mac_ver >= StrictVersion('10.10') else u'副词': u'adv.',
        u'数' if mac_ver >= StrictVersion('10.10') else u'数词': u'num.',
        u'代' if mac_ver >= StrictVersion('10.10') else u'代词': u'pron.',
        u'介' if mac_ver >= StrictVersion('10.10') else u'介词': u'prep.',
        u'连' if mac_ver >= StrictVersion('10.10') else u'连词': u'conj.',
        u'叹' if mac_ver >= StrictVersion('10.10') else u'叹词': u'excl.'
    }
    # NOTE(review): no final ``return result`` is visible in this chunk;
    # the tail of the function may be truncated -- confirm upstream.
    if external_cmd and parse_html:
        # use ElementTree to parse html
        ns = '{http://www.apple.com/DTDs/DictionaryService-1.0.rng}'
        phonetic_spans = []
        # The helper may emit several XML documents; split and parse each.
        for xml in definition.split('<?xml')[1:]:
            html = ET.fromstring('<?xml' + xml)
            entry = html.find('.//{}entry'.format(ns))
            word_span = entry.find(
                "./span[@class='hwg']/span[@{}dhw]".format(ns))
            if word_span is None:
                continue
            if word_span.text.encode('utf-8').lower() == word.lower():
                # Exact headword match: parse the whole entry.
                root = entry
                # NOTE(review): this ``.format(ns)`` is a no-op (no
                # placeholder in the string) -- presumably harmless leftover.
                phonetic_span = entry.find(
                    "./span[@class='hwg']/span[@class='pr']".format(ns))
                if phonetic_span is not None:
                    phonetic_spans.append(phonetic_span)
            else:
                # Otherwise look for *word* as a phrasal-verb sub-entry.
                if not is_eng:
                    continue
                phrase_span = entry.find(
                    "./span[@class='pvb']//span[@class='pvg'][span='{}']/..".
                    format(word))
                if phrase_span is None:
                    continue
                root = phrase_span
            # Each lexid span is one part-of-speech group.
            for span1 in root.findall("./span[@lexid]"):
                phonetic_span = span1.find("./span[@class='pr']")
                if phonetic_span is not None:
                    phonetic_spans.append(phonetic_span)
                part_span = span1.find("./span[@class='ps']")
                if part_span is None:
                    continue
                # Unknown labels fall back to "<label>.".
                part = part_map.get(part_span.text, part_span.text + '.')
                # Nested lexid spans are the individual senses.
                for span2 in span1.findall("./span[@lexid]"):
                    item = ''
                    for span3 in span2.findall('./span[@class]'):
                        if span3.attrib['class'] == 'ind':
                            item += ''.join(span3.itertext())
                        elif span3.attrib['class'] == 'trg':
                            # Translation group; skip the pinyin rendering.
                            for span4 in span3.findall('./span[@class]'):
                                if span4.attrib['class'] != 'trans ty_pinyin':
                                    item += ''.join(span4.itertext())
                    item = re.sub(r' {2,}', ' ', item).strip()
                    if item:
                        item = u'{} {}'.format(part, item).encode('utf-8')
                        result.append(item)
        # Deduplicate the collected phonetics and prepend the headword line.
        phonetics = []
        for phonetic_span in phonetic_spans:
            phonetic = ''.join(phonetic_span.itertext())
            phonetic = re.sub(r' {2,}', ' ', phonetic).encode('utf-8').strip(' |')
            phonetic = '/{}/'.format(phonetic)
            if phonetic not in phonetics:
                phonetics.append(phonetic)
        phonetics = ', '.join(phonetics)
        if phonetics or len(result) > 0:
            result.insert(0, '{} {}'.format(word, phonetics))
    else:
        # use regular expression to parse text
        # Character classes used to build the parsing regexes below.
        number = u'①-⑳㉑-㉟㊱-㊿'  # circled sense numbers
        chinese = ur'\u4e00-\u9fa5'  # CJK unified ideographs
        pinyin = u'āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň'  # toned pinyin letters
        phrase = r"a-zA-Z,\. "
        sentence = ur"0-9a-zA-Z'‘’«»£\$/\?!,\.\[\]\(\) "
        pinyin_all = u"a-zA-Z{}'… ".format(pinyin)
        # An example sentence followed by its Chinese translation.
        sentence_full = ur'([{1}][{0}]*[{1}]|\([{0}]*[{1}]|[{1}][{0}]*\)) ?[{2}]+'.format(
            sentence, sentence.replace(r'\(\) ', ''), chinese)
        # Usage labels stripped from each sense before output.
        ignore_list = [
            'Countable and uncountable', 'Uncountable and countable',
            'Countable', 'Uncountable', 'British', 'American', 'colloquial',
            'euphemistic', 'dated', 'Linguistics'
        ] if is_eng else [
            u'方言', u'客套话', u'委婉语', u'书面语', u'俗语', u'比喻义',
            u'口语', u'惯用语', u'旧词', u'敬辞'
        ]
        phrase_mode = False
        if is_eng:
            word_escaped = re.escape(word)
            # If the definition does not start with the word itself, retry
            # with the first token (phrasal-verb lookup).
            if not re.match(word_escaped + '(?= )', definition, re.I):
                verb_escaped = re.escape(word.split(' ')[0])
                if not re.match(verb_escaped + '(?= )', definition, re.I):
                    return result
                phrase_mode = True
            pos = definition.find('PHRASAL VERB')
            if phrase_mode:
                if pos == -1:
                    return result
                # Keep only the PHRASAL VERB section, cut to the sub-entry.
                definition = definition[pos:]
                match = re.search(
                    r'(({0}:? )([A-Z]\. )?({1}).*?)(?=\b{2} [{3}]*?:? ([A-Z]\. )?({1}))'
                    .format(word_escaped, '|'.join(part_map.keys()),
                            verb_escaped, phrase), definition)
                if match is None:
                    return result
                definition = match.group(1)
                start_pos = len(match.group(2))
            else:
                # Plain word: drop the PHRASAL VERB section entirely.
                if pos != -1:
                    definition = definition[:pos]
        if phrase_mode:
            result.append(word)
        else:
            # --- Extract phonetics -------------------------------------
            trimmed_len = 0
            single_phonetic = True
            if is_eng:
                # Homograph entries "A. |fonetik| ...": collect each distinct
                # phonetic and splice the "|...|" span out of the text.
                # ``trimmed_len`` compensates offsets for earlier deletions.
                phonetics = []
                for match in re.finditer(r'[A-Z]\. \|(.*?)\| ?', definition):
                    phonetic = match.group(1).encode('utf-8').strip()
                    phonetic = '/{}/'.format(phonetic)
                    if phonetic not in phonetics:
                        phonetics.append(phonetic)
                    start = match.start() + 3 - trimmed_len
                    end = match.end() - trimmed_len
                    definition = definition[:start] + definition[end:]
                    trimmed_len += end - start
                if len(phonetics) > 0:
                    phonetics = ', '.join(phonetics)
                    result.append('{} {}'.format(word, phonetics))
                    single_phonetic = False
            if single_phonetic:
                # Single leading phonetic: "|...|" (English) or pinyin token.
                match = re.search(
                    r'\|(.*?)\| ?'
                    if is_eng else ur'([^ ]*[{}][^ ]*) ?'.format(pinyin),
                    definition)
                if match is None:
                    return result
                phonetic = match.group(1).encode('utf-8').strip()
                result.append('{}{}'.format(
                    word, ' /{}/'.format(phonetic) if phonetic else ''))
                start_pos = match.span()[1]
        # --- Locate part-of-speech sections -----------------------------
        # part_list holds (section start, body start, abbreviation) tuples.
        part_list = []
        pattern = (r'({}) ?(\(.*?\))? ?'.format('|'.join(part_map.keys()))
                   if is_eng else ur'({}) '.format('|'.join(part_map.keys())))
        if 'A. ' not in definition:
            # Single homograph: the part of speech follows the phonetic.
            match = re.match(pattern, definition[start_pos:])
            if match:
                part_list.append((start_pos, start_pos + match.span()[1],
                                  part_map[match.group(1)]))
        else:
            # Multiple homographs labelled "A. ", "B. ", ...
            for match in re.finditer(ur'[A-Z]\. {}'.format(pattern), definition):
                part_list.append(
                    (match.start(), match.end(), part_map[match.group(1)]))
        # --- Extract the senses of each part, walking backwards ----------
        last_start_pos = len(definition)
        pattern = (
            ur"([^{4}]*?([{0}][{1}]*? |[{2}]*?(\) |›)))(?=({3}|[{4}]|$))".
            format(pinyin, pinyin_all, phrase, sentence_full, number)
            if is_eng else
            ur"(?![a-z] )([^{2}]*?[{0}]* )(?=([→{1}{2}]|$))".format(
                phrase, chinese, number))
        for part in reversed(part_list):
            entry_list = []
            text = definition[part[1]:last_start_pos]
            if u'① ' not in text:
                # Single sense for this part of speech.
                match = re.match(pattern, text)
                if match:
                    entry_list.append(match.group(1))
            else:
                # Numbered senses "① ", "② ", ...
                for match in re.finditer(ur'[{}] {}'.format(number, pattern), text):
                    entry_list.append(match.group(1))
            # Insert after the headword (index 0); walking parts in reverse
            # while always inserting at 1 preserves the original order.
            pos = 1
            for entry in entry_list:
                # Strip pinyin runs (English) / "[used ...]" notes (Chinese),
                # usage labels, empty parens, excess whitespace.
                entry = re.sub(
                    ur'[{0}]*[{1}][{0}]*'.format(pinyin_all, pinyin)
                    if is_eng else r'\[used .*?\]', '', entry)
                entry = re.sub(ur'({})'.format('|'.join(ignore_list)), '', entry)
                entry = re.sub(r'\([ /]*\)', '', entry)
                entry = re.sub(r' {2,}', ' ', entry).strip()
                if is_eng:
                    entry = entry.replace(u' ;', u';')
                entry = (u'{} {}'.format(part[2], entry)).encode('utf-8')
                result.insert(pos, entry)
                pos += 1
            last_start_pos = part[0]
def lookup(word, external_cmd=True, cmd=DEFAULT_CMD, dict_name=DEFAULT_DICT_NAME):
    """Look up *word* in a macOS dictionary and return a list of entries.

    Entries are UTF-8 byte strings: first the headword (with its phonetic(s),
    when found), then one "<part-of-speech> <definition>" item per sense.
    Returns [] when no definition is found.

    :param word: word to look up, a UTF-8 byte string (English or Chinese).
    :param external_cmd: when True, shell out to *cmd*; otherwise use the
        in-process DictionaryServices framework (DCSCopyTextDefinition).
    :param cmd: path of the external helper binary.
    :param dict_name: 'oxford' or 'landau'.
    :raises DictLookupError: unknown *dict_name*, or *cmd* missing/not
        executable.

    NOTE(review): Python 2 code (``ur''`` literals). This file contains
    several versions of ``lookup``; only the last definition is bound at
    runtime. This variant differs from the earlier one in small table
    details ('plural noun' label, narrower ``pinyin_all``, shorter ignore
    list).
    """
    # --- Fetch the raw definition text -----------------------------------
    if external_cmd:
        if os.path.isfile(cmd) and os.access(cmd, os.X_OK):
            if dict_name == 'oxford':
                dict_path = '/Library/Dictionaries/Simplified Chinese - English.dictionary'
            elif dict_name == 'landau':
                # Langdao ships separate English->Chinese / Chinese->English files.
                dict_file = 'langdao-ec-gb.dictionary' if is_english(word) else 'langdao-ce-gb.dictionary'
                dict_path = os.path.expanduser('~/Library/Dictionaries/{}'.format(dict_file))
            else:
                raise DictLookupError('dict name not valid.')
            proc = subprocess.Popen([cmd, dict_path, word], stdout=subprocess.PIPE)
            definition = proc.stdout.read()
            # The helper prints "(null)\n" when the word has no entry.
            if definition == '(null)\n':
                return []
            definition = definition.decode('utf-8')
        else:
            raise DictLookupError('file {} not found or not executable.'.format(cmd))
    else:
        # In-process lookup via the DictionaryServices framework.
        from DictionaryServices import DCSCopyTextDefinition
        unicode_word = word.decode('utf-8')
        word_range = (0, len(unicode_word))
        definition = DCSCopyTextDefinition(None, unicode_word, word_range)
        if definition is None:
            return []
    result = []
    # --- Parse the Oxford plain-text layout with regular expressions -----
    # For 'landau', no parsing happens here and ``result`` stays empty.
    # NOTE(review): no final ``return result`` is visible in this chunk;
    # the tail of the function may be truncated -- confirm upstream.
    if dict_name == 'oxford':
        is_eng = is_english(word)
        # Character classes used to build the parsing regexes below.
        number = u'①-⑳㉑-㉟㊱-㊿'  # circled sense numbers
        chinese = ur'\u4e00-\u9fa5'  # CJK unified ideographs
        pinyin = u'āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň'  # toned pinyin letters
        phrase = r"a-zA-Z,\. "
        sentence = ur"0-9a-zA-Z'‘’«»£\$/\?!,\.\[\]\(\) "
        pinyin_all = u"a-z{}'… ".format(pinyin)
        # An example sentence followed by its Chinese translation.
        sentence_full = ur'([{1}][{0}]*[{1}]|\([{0}]*[{1}]|[{1}][{0}]*\)) ?[{2}]+'.format(
            sentence, sentence.replace(r'\(\) ', ''), chinese)
        # Map the dictionary's part-of-speech labels to short abbreviations.
        part_map = {
            'noun': 'n.',
            'intransitive verb': 'vi.',
            'transitive verb': 'vt.',
            'adjective': 'adj.',
            'adverb': 'adv.',
            'determiner': 'det.',
            'pronoun': 'pron.',
            'preposition': 'prep.',
            'conjunction': 'conj.',
            'exclamation': 'excl.',
            'abbreviation': 'abbr.',
            'plural noun': 'pl.',
            'modifier': 'mod.'
        } if is_eng else {
            u'名词': u'n.',
            u'动词': u'v.',
            u'形容词': u'adj.',
            u'副词': u'adv.',
            u'数词': u'num.',
            u'代词': u'pron.',
            u'介词': u'prep.',
            u'连词': u'conj.',
            u'叹词': u'excl.'
        }
        # Usage labels stripped from each sense before output.
        ignore_list = [
            'Uncountable and countable', 'Countable', 'Uncountable',
            'British', 'American', 'colloquial', 'euphemistic', 'dated',
            'Linguistics'
        ] if is_eng else [
            u'方言', u'客套话', u'委婉语', u'书面语', u'俗语', u'比喻义',
            u'口语', u'惯用语'
        ]
        phrase_mode = False
        if is_eng:
            word_escaped = re.escape(word)
            # If the definition does not start with the word itself, retry
            # with the first token (e.g. "give" for "give up"): the lookup
            # then targets a phrasal-verb sub-entry.
            if not re.match(word_escaped + '(?= )', definition, re.I):
                verb_escaped = re.escape(word.split(' ')[0])
                if not re.match(verb_escaped + '(?= )', definition, re.I):
                    return result
                phrase_mode = True
            pos = definition.find('PHRASAL VERB')
            if phrase_mode:
                if pos == -1:
                    return result
                # Keep only the PHRASAL VERB section, then cut it down to the
                # sub-entry for the requested phrase.
                definition = definition[pos:]
                match = re.search(r'(({0}:? )([A-Z]\. )?({1}).*?)(?=\b{2} [{3}]*?:? ([A-Z]\. )?({1}))'.format(
                    word_escaped, '|'.join(part_map.keys()), verb_escaped, phrase), definition)
                if match is None:
                    return result
                definition = match.group(1)
                start_pos = len(match.group(2))
            else:
                # Plain word: drop the PHRASAL VERB section entirely.
                if pos != -1:
                    definition = definition[:pos]
        if phrase_mode:
            result.append(word)
        else:
            # --- Extract phonetics -------------------------------------
            trimmed_len = 0
            single_phonetic = True
            if is_eng:
                # Homograph entries "A. |fonetik| ...": collect each distinct
                # phonetic and splice the "|...|" span out of the text
                # (keeping the "A. " prefix). ``trimmed_len`` compensates
                # offsets for earlier deletions.
                phonetics = []
                for match in re.finditer(r'[A-Z]\. \|(.*?)\| ?', definition):
                    phonetic = match.group(1).encode('utf-8').strip()
                    phonetic = '/{}/'.format(phonetic)
                    if phonetic not in phonetics:
                        phonetics.append(phonetic)
                    start = match.start() + 3 - trimmed_len
                    end = match.end() - trimmed_len
                    definition = definition[:start] + definition[end:]
                    trimmed_len += end - start
                if len(phonetics) > 0:
                    phonetics = ', '.join(phonetics)
                    result.append('{} {}'.format(word, phonetics))
                    single_phonetic = False
            if single_phonetic:
                # Single leading phonetic: "|...|" for English, a pinyin
                # token for Chinese.
                match = re.search(r'\|(.*?)\| ?' if is_eng else ur'([^ ]*[{}][^ ]*) ?'.format(pinyin), definition)
                if match is None:
                    return result
                phonetic = match.group(1).encode('utf-8').strip()
                result.append('{}{}'.format(word, ' /{}/'.format(phonetic) if phonetic else ''))
                start_pos = match.span()[1]
        # --- Locate part-of-speech sections -----------------------------
        # part_list holds (section start, body start, abbreviation) tuples.
        part_list = []
        pattern = (r'({}) ?(\(.*?\))? ?'.format('|'.join(part_map.keys()))
                   if is_eng else ur'({}) '.format('|'.join(part_map.keys())))
        if 'A. ' not in definition:
            # Single homograph: the part of speech follows the phonetic.
            match = re.match(pattern, definition[start_pos:])
            if match:
                part_list.append((start_pos, start_pos + match.span()[1], part_map[match.group(1)]))
        else:
            # Multiple homographs labelled "A. ", "B. ", ...
            for match in re.finditer(ur'[A-Z]\. {}'.format(pattern), definition):
                part_list.append((match.start(), match.end(), part_map[match.group(1)]))
        # --- Extract the senses of each part, walking backwards ----------
        last_start_pos = len(definition)
        pattern = (ur"([^{4}]*?([{0}][{1}]*? |[{2}]*?(\) |›)))(?=({3}|[{4}]|$))".format(
                       pinyin, pinyin_all, phrase, sentence_full, number)
                   if is_eng else
                   ur"(?![a-z] )([^{2}]*?[{0}]* )(?=([→{1}{2}]|$))".format(phrase, chinese, number))
        for part in reversed(part_list):
            entry_list = []
            text = definition[part[1]:last_start_pos]
            if u'① ' not in text:
                # Single sense for this part of speech.
                match = re.match(pattern, text)
                if match:
                    entry_list.append(match.group(1))
            else:
                # Numbered senses "① ", "② ", ...
                for match in re.finditer(ur'[{}] {}'.format(number, pattern), text):
                    entry_list.append(match.group(1))
            # Insert after the headword (index 0); walking parts in reverse
            # while always inserting at 1 preserves the original order.
            pos = 1
            for entry in entry_list:
                # Strip pinyin runs (English) / "[used ...]" notes (Chinese),
                # usage labels, empty parens, excess whitespace.
                entry = re.sub(ur'[{0}]*[{1}][{0}]*'.format(pinyin_all, pinyin) if is_eng else r'\[used .*?\]', '', entry)
                entry = re.sub(ur'({})'.format('|'.join(ignore_list)), '', entry)
                entry = re.sub(r'\([ /]*\)', '', entry)
                entry = re.sub(r' {2,}', ' ', entry).strip()
                if is_eng:
                    entry = entry.replace(u' ;', u';')
                entry = (u'{} {}'.format(part[2], entry)).encode('utf-8')
                result.insert(pos, entry)
                pos += 1
            last_start_pos = part[0]