Example #1
from JapaneseTokenizer import JumanppWrapper
# the result/data classes ship with the package; the module path below is an
# assumption, and the `TokenizedSenetence` spelling is the library's own
from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult


def basic_example_jumanpp_3x():
    # the input is a `str` (Python 3.x has no separate `unicode` type)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    jumanpp_wrapper = JumanppWrapper()
    tokenized_objects = jumanpp_wrapper.tokenize(sentence=sentence,
                                                 normalize=True,
                                                 return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman++ Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem, token_object.word_surface,
            token_object.tuple_pos))

    ### You can also call Juman++ in server mode; start the Juman++ server beforehand ###
    jumanpp_wrapper = JumanppWrapper(server='localhost', port=12000)
    tokens_list = jumanpp_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens_list, list)
    ### Attention: delete the instance object when you have finished using server mode ###
    del jumanpp_wrapper

    # filtering works the same way as with MeCab
    filtered_result = JumanppWrapper(server='localhost', port=12000).tokenize(
        sentence, return_list=False).filter(
            pos_condition=[('名詞', )]).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
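The filter step can also drop specific tokens. A minimal standalone sketch, assuming the package's filter() accepts a stopwords argument alongside pos_condition; the sentence and the stop word chosen here are purely illustrative:

# hedged sketch: keep nouns, then drop one illustrative stop word.
# Assumes filter() takes a `stopwords` argument as well as `pos_condition`.
sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。'
nouns_without_stopwords = JumanppWrapper().tokenize(
    sentence, return_list=False).filter(
        pos_condition=[('名詞', )],
        stopwords=['テヘラン']).convert_list_object()
print(nouns_without_stopwords)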
Example #2
from JapaneseTokenizer import JumanppWrapper


class JumanppTokenizer:
    """Thin adapter that flattens Juman++ token objects into plain dicts."""

    def __init__(self):
        self.tokenizer = JumanppWrapper()

    def tokenize(self, text):
        tokenized_objects = self.tokenizer.tokenize(text).tokenized_objects
        return [
            dict(analyzed_line=obj.analyzed_line,
                 word_surface=obj.word_surface,
                 word_stem=obj.word_stem,
                 pos=list(obj.tuple_pos),
                 misc_info=obj.misc_info) for obj in tokenized_objects
        ]
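A short usage sketch for the adapter above; the sample sentence is illustrative:

tokenizer = JumanppTokenizer()
for token in tokenizer.tokenize('イランの首都はテヘランである。'):
    # each token is a dict with the keys built in tokenize() above
    print(token['word_surface'], token['pos'])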
Example #3
import logging
import socket

from JapaneseTokenizer import JumanppWrapper
# module path for the data classes is assumed; `TokenizedSenetence` is the
# library's own spelling
from JapaneseTokenizer.datamodels import TokenizedSenetence, TokenizedResult

logger = logging.getLogger(__name__)


def basic_example_jumanpp_3x():
    # the input is a `str` (Python 3.x has no separate `unicode` type)
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    jumanpp_wrapper = JumanppWrapper(is_use_pyknp=False)
    tokenized_objects = jumanpp_wrapper.tokenize(sentence=sentence,
                                                 normalize=True,
                                                 return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman++ Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem, token_object.word_surface,
            token_object.tuple_pos))

    ### You can also call Juman++ in server mode; start the Juman++ server beforehand ###
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    HOST = 'localhost'
    PORT = 12000
    try:
        s.connect((HOST, PORT))
        s.close()
        jumanpp_wrapper = JumanppWrapper(server=HOST, port=PORT)
        tokens_list = jumanpp_wrapper.tokenize(sentence=sentence,
                                               return_list=True)
        assert isinstance(tokens_list, list)
        ### Attention: delete the instance object when you have finished using server mode ###
        del jumanpp_wrapper
        # filtering works the same way as with MeCab
        filtered_result = JumanppWrapper(server=HOST, port=PORT).tokenize(
            sentence, return_list=False).filter(
                pos_condition=[('名詞', )]).convert_list_object()
        assert isinstance(filtered_result, list)
    except OSError:
        # connection refused: the Juman++ server is not running
        logger.info(msg='Juman++ server is not running. Skip it.')
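The inline socket probe can be factored out; a minimal sketch using only the standard library (the helper name is_juman_server_up is hypothetical):

def is_juman_server_up(host='localhost', port=12000, timeout=1.0):
    # hypothetical helper: True if a TCP connection to the server succeeds
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False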
Example #4
def __init__(self):
    self.tokenizer = JumanppWrapper()
Example #5
import string

from JapaneseTokenizer import JumanppWrapper
from nltk.corpus import stopwords

# assumed setup (not shown in the source): `fhand` is an open file of Japanese
# stop words, one per line, and `result` is a database cursor over tweets
stop_words = []

# parse the Japanese stop words, skipping duplicates
for words in fhand:
    words = words.strip('\n')
    if words not in stop_words:
        stop_words.append(words)
punctuation = list(string.punctuation)
jp_stop_words = stop_words + stopwords.words('english') + punctuation + [
    "RT", '', ' '
]

# parse and tokenize tweets from the database
output = []
#result = result.fetchall()
#conn.close()
jumanpp_wrapper = JumanppWrapper()
# lexical resources: Japanese WordNet plus English SentiWordNet
jp_wn = 'wnjpn-all.tab'
en_swn = 'SentiWordNet_3.0.0_20130122.txt'
# `Sentiment` is a classifier defined elsewhere in the source project
classifier = Sentiment()
classifier.train(en_swn, jp_wn)
for sentences in result.fetchall():
    try:
        sentences = sentences[0]
        msg_tokenized = jumanpp_wrapper.tokenize(
            sentence=sentences, is_feature=False,
            is_surface=False).convert_list_object()
        print(msg_tokenized)
        filtered_sentences = [
            w for w in msg_tokenized if w not in jp_stop_words
        ]
    except Exception:
        # the original handler is not shown; skip rows that fail to tokenize
        continue
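Membership tests against jp_stop_words scan a list on every token; a set gives constant-time lookups. A minimal sketch of the same filter:

# same filtering with a set for O(1) membership tests
jp_stop_set = set(jp_stop_words)
filtered_sentences = [w for w in msg_tokenized if w not in jp_stop_set]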
Example #6
def normalize(en_lines, ja_lines):
    """ Process lists of both English and Japanese strings.
    """
    segmenter = JumanppWrapper()
    return [[normalize_en(l1), normalize_ja(l2, segmenter)]
            for l1, l2 in zip(en_lines, ja_lines)]
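normalize_en and normalize_ja are not shown in the source; a minimal, purely hypothetical sketch of what they might do, given that tokenize(..., return_list=True) returns a list of surface strings:

import re

def normalize_en(line):
    # hypothetical: lowercase and collapse runs of whitespace
    return re.sub(r'\s+', ' ', line.strip().lower())

def normalize_ja(line, segmenter):
    # hypothetical: segment with Juman++ and re-join tokens with spaces
    return ' '.join(segmenter.tokenize(sentence=line, return_list=True))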