def basic_example_jumanpp_3x():
    """Demo of Juman++ tokenization via JapaneseTokenizer.

    Shows plain tokenization, iteration over token objects, server-mode
    usage, and POS filtering.  The server-mode part is skipped gracefully
    (instead of crashing) when no Juman++ server is listening on
    localhost:12000 — matching the guarded variant of this demo.
    """
    import socket  # local import: only needed for the server probe below

    # Input is `str` (unicode) in Python 3.x.
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    jumanpp_wrapper = JumanppWrapper()
    tokenized_objects = jumanpp_wrapper.tokenize(sentence=sentence,
                                                 normalize=True,
                                                 return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman++ Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        # NOTE: fixed typo in the output label ("surafce" -> "surface").
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem,
            token_object.word_surface,
            token_object.tuple_pos))

    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###
    host, port = 'localhost', 12000
    # Probe the port first so the demo degrades gracefully when the server
    # is down; `with` closes the probe socket even if connect() raises.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.connect((host, port))
    except OSError:
        print('Juman++ server is not running on {}:{}. Skip server-mode demo.'.format(host, port))
        return

    jumanpp_wrapper = JumanppWrapper(server=host, port=port)
    tokens_list = jumanpp_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens_list, list)
    ### Attention: Please delete instance object of sever mode when you finished using it ###
    del jumanpp_wrapper

    # Filtering works the same as with MeCab.
    filtered_result = JumanppWrapper(server=host, port=port).tokenize(
        sentence, return_list=False).filter(
        pos_condition=[('名詞', )]).convert_list_object()
    assert isinstance(filtered_result, list)
    print(filtered_result)
class JumanppTokenizer:
    """Thin adapter exposing JumanppWrapper results as plain dicts."""

    def __init__(self):
        # Shared wrapper instance reused across tokenize() calls.
        self.tokenizer = JumanppWrapper()

    def tokenize(self, text):
        """Tokenize *text* and return a list with one dict per token.

        Each dict carries the analyzed line, surface form, stem,
        part-of-speech tuple (as a list), and misc info of the token.
        """
        records = []
        for token in self.tokenizer.tokenize(text).tokenized_objects:
            records.append({
                'analyzed_line': token.analyzed_line,
                'word_surface': token.word_surface,
                'word_stem': token.word_stem,
                'pos': list(token.tuple_pos),
                'misc_info': token.misc_info,
            })
        return records
def basic_example_jumanpp_3x():
    """Demo of Juman++ tokenization (pure-wrapper mode) via JapaneseTokenizer.

    Shows plain tokenization, iteration over token objects, server-mode
    usage, and POS filtering.  The server-mode part is skipped when no
    Juman++ server is listening on localhost:12000.
    """
    # Input is `str` (unicode) in Python 3.x.
    sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    jumanpp_wrapper = JumanppWrapper(is_use_pyknp=False)
    tokenized_objects = jumanpp_wrapper.tokenize(sentence=sentence,
                                                 normalize=True,
                                                 return_list=False)
    assert isinstance(tokenized_objects, TokenizedSenetence)
    print('-' * 30)
    print('Juman++ Demo')
    print(tokenized_objects.convert_list_object())
    for token_object in tokenized_objects.tokenized_objects:
        assert isinstance(token_object, TokenizedResult)
        # NOTE: fixed typo in the output label ("surafce" -> "surface").
        print('word_stem:{}, word_surface:{}, pos:{}'.format(
            token_object.word_stem,
            token_object.word_surface,
            token_object.tuple_pos))

    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###
    HOST = 'localhost'
    PORT = 12000
    try:
        # Probe the port first; `with` guarantees the socket is closed even
        # when connect() raises (the original leaked it on failure).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.connect((HOST, PORT))
    except OSError:
        # Narrowed from a bare `except:` so real bugs (failed asserts,
        # KeyboardInterrupt) are no longer silently swallowed; OSError
        # covers ConnectionRefusedError and other connection failures.
        logger.info(msg='Juman++ server is not running. Skip it.')
        return

    jumanpp_wrapper = JumanppWrapper(server=HOST, port=PORT)
    tokens_list = jumanpp_wrapper.tokenize(sentence=sentence, return_list=True)
    assert isinstance(tokens_list, list)
    ### Attention: Please delete instance object of sever mode when you finished using it ###
    del jumanpp_wrapper

    # Filtering works the same as with MeCab.
    filtered_result = JumanppWrapper(server=HOST, port=PORT).tokenize(
        sentence, return_list=False).filter(
        pos_condition=[(u'名詞', )]).convert_list_object()
    assert isinstance(filtered_result, list)
def __init__(self):
    """Create the JumanppWrapper instance used for all tokenization calls."""
    self.tokenizer = JumanppWrapper()
#parsing the Japanese stop words count = 0 for words in fhand: words = words.strip('\n') if words not in stop_words: stop_words.append(words) punctuation = list(string.punctuation) jp_stop_words = stop_words + stopwords.words('english') + punctuation + [ "RT", '', ' ' ] #parsing and tokenizing twitts fro the database output = [] #result = result.fetchall() #conn.close() jumanpp_wrapper = JumanppWrapper() jp_wn = 'wnjpn-all.tab' en_swn = 'SentiWordNet_3.0.0_20130122.txt' classifier = Sentiment() classifier.train(en_swn, jp_wn) for sentences in result.fetchall(): try: sentences = sentences[0] msg_tokenized = jumanpp_wrapper.tokenize( sentence=sentences, is_feature=False, is_surface=False).convert_list_object() print(msg_tokenized) filtered_sentences = [ w for w in msg_tokenized if not w in jp_stop_words ]
def normalize(en_lines, ja_lines):
    """Normalize paired English and Japanese lines.

    Pairs the two input sequences positionally and returns a list of
    [english, japanese] pairs, each side normalized by its language-specific
    helper; Japanese normalization reuses one shared Juman++ segmenter.
    """
    tokenizer = JumanppWrapper()
    pairs = []
    for en_text, ja_text in zip(en_lines, ja_lines):
        pairs.append([normalize_en(en_text),
                      normalize_ja(ja_text, tokenizer)])
    return pairs