Example #1
# Imports assumed from the JapaneseTokenizer package; the datamodels path may differ between versions.
from JapaneseTokenizer import JumanWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult


def basic_example_juman_2x():
    # input is `unicode` type (in Python 2.x)
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    juman_wrapper = JumanWrapper()
    tokenized_objects = juman_wrapper.tokenize(sentence=sentence,
                                               normalize=True,
                                               return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print(u'Juman Demo')
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem, token_object.word_surface,
            token_object.tuple_pos))

    ### You can also call JUMAN in server mode. The JUMAN server must already be running. ###
    juman_wrapper = JumanWrapper(server='localhost', port=32000)
    tokens_list = juman_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens_list, list)

    # filtering works the same way as with MeCab
    filtered_result = JumanWrapper().tokenize(
        sentence, return_list=False).filter(
            pos_condition=[(u'名詞', )]).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
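The POS filter returns a plain Python list, so downstream processing is ordinary list handling. A minimal sketch, assuming `convert_list_object()` yields one string per kept token; the `count_nouns` helper is hypothetical, not part of the library:

from collections import Counter

from JapaneseTokenizer import JumanWrapper


def count_nouns(sentence):
    # Keep only nouns (名詞) and count how often each one appears.
    nouns = JumanWrapper().tokenize(sentence, return_list=False) \
        .filter(pos_condition=[(u'名詞', )]).convert_list_object()
    return Counter(nouns)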
Example #2
from JapaneseTokenizer import JumanWrapper


class JumanTokenizer:
    def __init__(self):
        self.tokenizer = JumanWrapper()

    def tokenize(self, text):
        tokenized_objects = self.tokenizer.tokenize(text).tokenized_objects
        return [
            dict(analyzed_line=obj.analyzed_line,
                 word_surface=obj.word_surface,
                 word_stem=obj.word_stem,
                 pos=list(obj.tuple_pos),
                 misc_info=obj.misc_info) for obj in tokenized_objects
        ]
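A short usage sketch for the wrapper class above; the sentence is arbitrary and the dict keys are exactly the ones built in `tokenize()`:

tokenizer = JumanTokenizer()
for token in tokenizer.tokenize('イランの首都はテヘランです。'):
    # Each entry is a plain dict, so it can be serialized to JSON directly.
    print(token['word_surface'], token['word_stem'], token['pos'])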
Example #3
# Imports assumed from the JapaneseTokenizer package; the datamodels path may differ between versions.
import logging
import socket

from JapaneseTokenizer import JumanWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult

logger = logging.getLogger(__name__)


def basic_example_juman_3x():
    # input is `str` type (in Python 3.x)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    juman_wrapper = JumanWrapper()
    tokenized_objects = juman_wrapper.tokenize(sentence=sentence,
                                               normalize=True,
                                               return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem, token_object.word_surface,
            token_object.tuple_pos))

    ### You can also call JUMAN in server mode. The JUMAN server must already be running. ###
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    HOST = 'localhost'
    PORT = 32000
    try:
        s.connect((HOST, PORT))
        s.close()
        juman_wrapper = JumanWrapper(server=HOST, port=PORT)
        tokens_list = juman_wrapper.tokenize(sentence=sentence,
                                             return_list=True)
        assert isinstance(tokens_list, list)
        # filtering works the same way as with MeCab
        filtered_result = JumanWrapper(
            server='localhost',
            port=32000).tokenize(sentence, return_list=False).filter(
                pos_condition=[('名詞', )]).convert_list_object()
        assert isinstance(filtered_result, list)
        print(filtered_result)
    except Exception:
        logger.info(msg='Juman server is not running. Skipping it.')


def basic_example_juman_2x():
    # input is `unicode` type (in Python 2.x)
    sentence = u'テヘラン(ペルシア語: تهران  ; Tehrān Tehran.ogg 発音[ヘルプ/ファイル]/teɦˈrɔːn/、英語:Tehran)は、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    juman_wrapper = JumanWrapper()
    tokenized_objects = juman_wrapper.tokenize(
        sentence=sentence,
        normalize=True,
        return_list=False
    )
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-'*30)
    print(u'Juman Demo')
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print(u'word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem,
            token_object.word_surface,
            token_object.tuple_pos))
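A minimal driver sketch, assuming the functions above are collected into a single script; it runs the example that matches the interpreter:

if __name__ == '__main__':
    import sys
    # u'' literals are also valid in Python 3, but keep each example with its intended version.
    if sys.version_info[0] >= 3:
        basic_example_juman_3x()
    else:
        basic_example_juman_2x()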