Example #1
from JapaneseTokenizer import JumanppWrapper
# the result/data classes ship with the package; the module path below is an
# assumption, and the `TokenizedSenetence` spelling is the library's own
from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult


def basic_example_jumanpp_3x():
    # the input is a `str` (Python 3.x has no separate `unicode` type)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    jumanpp_wrapper = JumanppWrapper()
    tokenized_objects = jumanpp_wrapper.tokenize(sentence=sentence,
                                                 normalize=True,
                                                 return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman++ Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem, token_object.word_surface,
            token_object.tuple_pos))

    ### You can also call Juman++ in server mode; start the Juman++ server beforehand ###
    jumanpp_wrapper = JumanppWrapper(server='localhost', port=12000)
    tokens_list = jumanpp_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens_list, list)
    ### Attention: delete the instance object when you have finished using server mode ###
    del jumanpp_wrapper

    # filtering works the same way as with MeCab
    filtered_result = JumanppWrapper(server='localhost', port=12000).tokenize(
        sentence, return_list=False).filter(
            pos_condition=[('名詞', )]).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
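The filter step can also drop specific tokens. A minimal standalone sketch, assuming the package's filter() accepts a stopwords argument alongside pos_condition; the sentence and the stop word chosen here are purely illustrative:

# hedged sketch: keep nouns, then drop one illustrative stop word.
# Assumes filter() takes a `stopwords` argument as well as `pos_condition`.
sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。'
nouns_without_stopwords = JumanppWrapper().tokenize(
    sentence, return_list=False).filter(
        pos_condition=[('名詞', )],
        stopwords=['テヘラン']).convert_list_object()
print(nouns_without_stopwords)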
Example #2
from JapaneseTokenizer import JumanppWrapper


class JumanppTokenizer:
    """Thin adapter that flattens Juman++ token objects into plain dicts."""

    def __init__(self):
        self.tokenizer = JumanppWrapper()

    def tokenize(self, text):
        tokenized_objects = self.tokenizer.tokenize(text).tokenized_objects
        return [
            dict(analyzed_line=obj.analyzed_line,
                 word_surface=obj.word_surface,
                 word_stem=obj.word_stem,
                 pos=list(obj.tuple_pos),
                 misc_info=obj.misc_info) for obj in tokenized_objects
        ]
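A short usage sketch for the adapter above; the sample sentence is illustrative:

tokenizer = JumanppTokenizer()
for token in tokenizer.tokenize('イランの首都はテヘランである。'):
    # each token is a dict with the keys built in tokenize() above
    print(token['word_surface'], token['pos'])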
Example #3
import logging
import socket

from JapaneseTokenizer import JumanppWrapper
# module path for the data classes is assumed; `TokenizedSenetence` is the
# library's own spelling
from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult

logger = logging.getLogger(__name__)


def basic_example_jumanpp_3x():
    # the input is a `str` (Python 3.x has no separate `unicode` type)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    jumanpp_wrapper = JumanppWrapper(is_use_pyknp=False)
    tokenized_objects = jumanpp_wrapper.tokenize(sentence=sentence,
                                                 normalize=True,
                                                 return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman++ Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem, token_object.word_surface,
            token_object.tuple_pos))

    ### You can also call Juman++ in server mode; start the Juman++ server beforehand ###
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    HOST = 'localhost'
    PORT = 12000
    try:
        s.connect((HOST, PORT))
        s.close()
        jumanpp_wrapper = JumanppWrapper(server=HOST, port=PORT)
        tokens_list = jumanpp_wrapper.tokenize(sentence=sentence,
                                               return_list=True)
        assert isinstance(tokens_list, list)
        ### Attention: delete the instance object when you have finished using server mode ###
        del jumanpp_wrapper
        # filtering works the same way as with MeCab
        filtered_result = JumanppWrapper(server=HOST, port=PORT).tokenize(
            sentence, return_list=False).filter(
                pos_condition=[('名詞', )]).convert_list_object()
        assert isinstance(filtered_result, list)
    except OSError:
        # connection refused: the Juman++ server is not running
        logger.info(msg='Juman++ server is not running. Skip it.')
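The inline socket probe can be factored out; a minimal sketch using only the standard library (the helper name is_juman_server_up is hypothetical):

def is_juman_server_up(host='localhost', port=12000, timeout=1.0):
    # hypothetical helper: True if a TCP connection to the server succeeds
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False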
Example #4
def __init__(self):
    self.tokenizer = JumanppWrapper()
Example #5
import string

from JapaneseTokenizer import JumanppWrapper
from nltk.corpus import stopwords

# assumed setup (not shown in the source): `fhand` is an open file of Japanese
# stop words, one per line, and `result` is a database cursor over tweets
stop_words = []

# parse the Japanese stop words, skipping duplicates
for words in fhand:
    words = words.strip('\n')
    if words not in stop_words:
        stop_words.append(words)
punctuation = list(string.punctuation)
jp_stop_words = stop_words + stopwords.words('english') + punctuation + [
    "RT", '', ' '
]

# parse and tokenize tweets from the database
output = []
#result = result.fetchall()
#conn.close()
jumanpp_wrapper = JumanppWrapper()
# lexical resources: Japanese WordNet plus English SentiWordNet
jp_wn = 'wnjpn-all.tab'
en_swn = 'SentiWordNet_3.0.0_20130122.txt'
# `Sentiment` is a classifier defined elsewhere in the source project
classifier = Sentiment()
classifier.train(en_swn, jp_wn)
for sentences in result.fetchall():
    try:
        sentences = sentences[0]
        msg_tokenized = jumanpp_wrapper.tokenize(
            sentence=sentences, is_feature=False,
            is_surface=False).convert_list_object()
        print(msg_tokenized)
        filtered_sentences = [
            w for w in msg_tokenized if w not in jp_stop_words
        ]
    except Exception:
        # the original handler is not shown; skip rows that fail to tokenize
        continue
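Membership tests against jp_stop_words scan a list on every token; a set gives constant-time lookups. A minimal sketch of the same filter:

# same filtering with a set for O(1) membership tests
jp_stop_set = set(jp_stop_words)
filtered_sentences = [w for w in msg_tokenized if w not in jp_stop_set]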
Example #6
def normalize(en_lines, ja_lines):
    """ Process lists of both English and Japanese strings.
    """
    segmenter = JumanppWrapper()
    return [[normalize_en(l1), normalize_ja(l2, segmenter)]
            for l1, l2 in zip(en_lines, ja_lines)]
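normalize_en and normalize_ja are not shown in the source; a minimal, purely hypothetical sketch of what they might do, given that tokenize(..., return_list=True) returns a list of surface strings:

import re

def normalize_en(line):
    # hypothetical: lowercase and collapse runs of whitespace
    return re.sub(r'\s+', ' ', line.strip().lower())

def normalize_ja(line, segmenter):
    # hypothetical: segment with Juman++ and re-join tokens with spaces
    return ' '.join(segmenter.tokenize(sentence=line, return_list=True))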