def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if args.data_path != "":
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        import pdb; pdb.set_trace()  # drop into the debugger to inspect the batcher
        x = batcher.next_batch()
        pdb.set_trace()  # inspect the batch
    else:
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])
        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
        import pdb; pdb.set_trace()  # inspect the single-example batch
def json_batch(fname, hps, vocab):
    with open(fname) as f:
        art = json.load(f)
    article = neologdn.normalize(art['body'])
    abstract = neologdn.normalize(art['title'])
    m = MeCab('-Owakati')
    parsed_article = m.parse(article)
    abs_words = m.parse(abstract).split()
    ex = B.Example(parsed_article, abs_words, vocab, hps)
    b = B.Batch([ex], hps, vocab)
    return b
class MecabTagger(object):
    """Context-manager wrapper around natto's MeCab tagger."""

    # TAGSET = set("""NNG NNP NNB NNBC NR NP VV VA VX VCP VCN MM MAG MAJ IC
    #                 JKS JKC JKG JKO JKB JKV JKQ JX JC EP EF EC ETN ETM
    #                 XPN XSN XSV XSA XR SF SE SSO SSC SC SY SL SH SN
    #                 UNKNOWN EOS""".split())

    def __init__(self, **kwargs):
        self.tagger = MeCab(kwargs)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        del self.tagger

    @staticmethod
    def tagged_tuple(node):
        surface = node.surface
        features = node.feature.split(',')
        first_pos = features[0].partition('+')[0]
        lemma = (features[7].partition('/')[0]
                 if features[4].startswith('Inflect')
                 else surface.lower())
        return Word(decode(surface, True),
                    decode(lemma, True),
                    first_pos.decode('ascii'),
                    node.cost)

    def parse(self, text):  # follow NLTK naming
        return [MecabTagger.tagged_tuple(node)
                for node in self.tagger.parse(text.encode(settings.DEFAULT_ENCODING),
                                              as_nodes=True)
                if not node.is_eos()]
class KoreanTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        MeCab = try_mecab_import()
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text):
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem (어간) or pre-final ending (선어말 어미)
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text):
        # Feature fields: POS tag[0], semantic class[1], final consonant (종성 유무)[2],
        # reading[3], type[4], first POS[5], last POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}
def parse2df(text, sysdic="/usr/local/lib/mecab/dic/mecab-ipadic-neologd"): df = pd.DataFrame( index=[], columns=['文番号', '表層', '品詞1', '品詞2', '品詞3', '品詞4', '原型', 'posID']) text = text.split("\n") #改行で分割して配列にする while '' in text: #空行は削除 text.remove('') parser = MeCab("-d " + sysdic) for index, sentence in enumerate(text): nodes = parser.parse(sentence, as_nodes=True) for node in nodes: if not node.is_eos(): #品詞情報を分割 feature = node.feature.split(',') #dataframeに追加 series = pd.Series( [ index, #文番号 node.surface, #表層 feature[0], #品詞1 feature[1], #品詞2 feature[2], #品詞3 feature[3], #品詞4 feature[6], #原型 node.posid #品詞番号 ], index=df.columns) df = df.append(series, ignore_index=True) return df
def text_segmentation_and_pronunciation(sender, instance, *args, **kwargs):
    if instance.language == 'zh-hans' or instance.language == 'zh-hant':
        # separate Chinese words with spaces
        import jieba
        # from config.settings.base import JIEBA_DICT_PATH
        # jieba.set_dictionary(JIEBA_DICT_PATH)

        def cut(s):
            return jieba.cut(s, cut_all=False)

        seg_list = cut(instance.body)
        instance.body = " ".join(seg_list)
        # # generate Pinyin
        # import ChineseTone as ct
        # # pinyin_list = ct.PinyinHelper.convertToPinyinFromSentence(instance.body, pinyinFormat=ct.PinyinFormat.WITH_TONE_MARK, segment=cut)
        # # instance.pronunciation = ' '.join(pinyin_list)
        instance.save()
    elif instance.language == 'ja':
        # separate Japanese words with spaces
        from natto import MeCab
        nm = MeCab('-Owakati')
        output = nm.parse(instance.body)
        instance.body = output
        instance.save()
def nlp(data):
    points = 0
    nm = MeCab()
    negaposi_dic = getNegaPosiDic()
    sentenses = re.split('[。!!♪♫★☆>??()w]', data)
    try:
        for sentense in sentenses:
            negaposi = 0
            result_all = nm.parse(sentense)
            result_words = result_all.split('\n')[:-1]
            for word in result_words:
                try:
                    word_toarray = re.split('[\t,]', word)
                    if word_toarray[7] in negaposi_dic:
                        negaposi = int(negaposi_dic[word_toarray[7]])
                        print(word_toarray[7], negaposi_dic[word_toarray[7]],
                              flush=True)
                except Exception as e:
                    print('%r' % e, flush=True)
            points += negaposi
    except Exception as e:
        print('%r' % e, flush=True)
        print(data, flush=True)
    return points
def nlp(data):
    nm = MeCab()  # create a MeCab instance named nm
    points = 0  # score for the whole text
    negaposi_dic = getNegaPosiDic()  # load the sentiment data (the function defined earlier)
    sentenses = re.split("[。!!♪♫★☆>??()w]", data)  # split into sentences
    try:
        for sentense in sentenses:  # loop over every sentence
            negaposi = 0
            result_all = nm.parse(sentense)  # morphological analysis / POS tagging
            result_words = result_all.split("\n")[:-1]  # split into words
            for word in result_words:
                try:
                    word_toarray = re.split('[\t,]', word)
                    if word_toarray[7] in negaposi_dic:
                        negaposi = int(
                            negaposi_dic[word_toarray[7]])  # nega/posi score for this sentence
                        print(word_toarray[7], negaposi_dic[word_toarray[7]],
                              flush=True)  # word found in the sentiment list and its score
                except Exception as e:
                    print('%r' % e, flush=True)
            points += negaposi  # add to the overall score
    except Exception as e:
        print('%r' % e, flush=True)
        print(data, flush=True)
    return points  # return the score for the whole text
def mecab_analysis(text):
    import os
    mecab_flags = [
        f'-d {os.popen("mecab-config --dicdir").read().strip()}/mecab-ipadic-neologd/',
        '-u username.dic',
    ]
    t = MeCab(' '.join(mecab_flags))
    # always keep the string passed to MeCab in a variable
    # https://shogo82148.github.io/blog/2012/12/15/mecab-python/
    enc_text = text.strip()
    # workaround for UnicodeDecodeError http://taka-say.hateblo.jp/entry/2015/06/24/183748
    t.parse('')
    # node = t.parseToNode(enc_text)
    output = []
    for node in t.parse(enc_text, as_nodes=True):
        if node.surface != "":  # skip the header and footer (BOS/EOS) nodes
            word_type = node.feature.split(",")[0]
            if word_type in ["形容詞", "名詞", "副詞"]:
                output.append(node.surface)
    return output
def main_mecab():
    mc = MeCab()
    with open('./data/neko.txt', 'r') as f:
        with open(save_path, 'w') as fw:
            for line in f:
                for one_sent in line.strip().split():  # a line may contain two or more sentences
                    fw.write(mc.parse(one_sent.strip()) + "\n")
    print("save mecab file {}".format(save_path))
def keitaiso_kaiseki(self, sentence):
    nm = MeCab()
    terms = []
    for node in nm.parse(sentence, as_nodes=True):
        features = node.feature.split(',')
        if features[0] == '名詞' or features[0] == '形容詞':
            terms.append(node.surface)
    return terms
def word_frequencies(text):
    from manabi.apps.reading_level.word_frequencies import WORD_FREQUENCIES

    mecab = MeCab()
    frequencies = []
    for node in mecab.parse(text.encode('utf8'), as_nodes=True):
        frequency = WORD_FREQUENCIES.get(node.surface.decode('utf8'))
        if frequency is None:
            continue
        frequencies.append(frequency)
    return frequencies
class MeCabParser(object):
    def __init__(self):
        if os.name == 'nt':
            self.engine = RemoteMecabParser()
        else:
            self.engine = MeCab()

    def parse(self, text):
        masked_text, urls = self.url_mask(text)
        masked_text, figures = self.figure_mask(masked_text)
        masked_text, digits = self.digit_mask(masked_text)
        mc_ret = self.engine.parse(masked_text)
        mc_lines = mc_ret.split('\n')
        dic = {}
        for line in mc_lines:
            s = line.split('\t')
            if len(s) >= 2:
                buf = s[1].split(',')
                meta = {'part1': buf[0], 'part2': buf[1], 'part3': buf[2]}
                if len(buf) > 9:
                    meta['add1'] = buf[9]
                key = s[0]
                if key == 'PACMECABURL':
                    key = urls.pop(0)
                if key == 'PACMECABFIGURE':
                    key = figures.pop(0)
                if key == 'PACMECABDIGIT':
                    key = digits.pop(0)
                dic[key] = meta  # identical words are deduplicated because they are used as keys
        return dic

    def url_mask(self, text):
        match = re.findall(PTN_URL, text)
        for m in match:
            text = text.replace(m, 'PACMECABURL')
        return text, match

    def figure_mask(self, text):
        match_list = []
        for ptn in PTN_FIGURE:
            match = re.findall(ptn, text)
            for m in match:
                text = text.replace(m, 'PACMECABFIGURE')
            match_list += match
        return text, match_list

    def digit_mask(self, text):
        match = re.findall(r'\d{5,}', text)
        for m in match:
            text = text.replace(m, 'PACMECABDIGIT')
        return text, match
def parse_word_list(text):
    """Convert the text into a list of words and return it."""
    words = list()
    with MeCab('-F%m,%f[0]') as nm:
        for n in nm.parse(text, as_nodes=True):
            fields = n.feature.split(',')
            if fields[0] != 'EOS' and is_valid_speech(fields[1]):
                words.append(fields[0])
    return words
class MeCabTokenizer:
    DEFAULT_DICTIONARY = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"

    def __init__(self, dic=DEFAULT_DICTIONARY):
        self._mecab = MeCab(f"-d {dic} -F%f[0],%f[1],%f[2],%f[3],%f[6]")
        self._tokenizer = tf.keras.preprocessing.text.Tokenizer()

    def tokenize(self, text):
        tokens = []
        for node in self._mecab.parse(text, as_nodes=True):
            if node.is_eos():
                continue
            feature = node.feature.split(",")
            part_of_speech, lemma = feature[0:4], feature[4]
            if part_of_speech[0] not in ["名詞", "動詞", "形容詞"]:
                continue
            if part_of_speech[0:2] == ["名詞", "数"]:
                continue
            tokens.append(lemma)
        return " ".join(tokens)

    def fit_on_texts(self, texts):
        texts = [
            self.tokenize(text)
            for text in progress.track(texts, description="Fitting on texts...")
        ]
        self._tokenizer.fit_on_texts(texts)
        return self._tokenizer.texts_to_sequences(texts)

    def texts_to_matrix(self, texts):
        texts = [self.tokenize(text) for text in texts]
        return self._tokenizer.texts_to_matrix(texts, mode="tfidf")

    def sequences_to_matrix(self, sequences):
        return self._tokenizer.sequences_to_matrix(sequences, mode="tfidf")

    def save(self, path):
        with open(path, "w") as f:
            f.write(self._tokenizer.to_json())

    def load(self, path):
        with open(path) as f:
            self._tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(f.read())
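# A minimal usage sketch for the MeCabTokenizer above (assumes mecab-ipadic-neologd is
# installed at DEFAULT_DICTIONARY and that the surrounding module's imports, e.g. rich's
# progress, are available; the sample strings are made up for illustration).
tokenizer = MeCabTokenizer()
corpus = ["猫が庭で寝ている。", "犬が公園を走っている。"]
sequences = tokenizer.fit_on_texts(corpus)   # index sequences of the extracted lemmas
tfidf = tokenizer.texts_to_matrix(corpus)    # TF-IDF weighted document-term matrix
print(sequences)
print(tfidf.shape)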
def from_sentence(klass, sentence: str, mecab: MeCab) -> 'Iterable[Word]':
    normalized = jaconv.normalize(sentence)
    for mec_node in mecab.parse(normalized, as_nodes=True):
        if mec_node.is_eos():
            break
        res = AnalyzeMorp(mec_node)
        if res.is_symbol():
            continue
        if TagWord.is_include(res.surface()):
            yield TagWord(res.surface())
        else:
            yield Word(surface=res.surface(), yomi=res.yomi())
def txt2words(txt) -> list:
    posid = [
        36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 66, 67, 2,
        31, 36, 10, 34
    ]
    words = []
    parser = MeCab()
    nodes = parser.parse(txt, as_nodes=True)
    for node in nodes:
        if not node.is_eos():
            feature = node.feature.split(',')
            if node.posid in posid and feature[6] != "*":
                words.append(feature[6])
    return words
class MeCabTokenizer(BaseTokenizer):
    def __init__(
        self,
        user_dictionary_path: Optional[str] = None,
        system_dictionary_path: Optional[str] = None,
        dictionary_format: Optional[str] = None,
    ) -> None:
        from natto import MeCab

        super().__init__(name="mecab")
        options = []
        if isinstance(user_dictionary_path, str):
            options.append("-u {}".format(user_dictionary_path))
        if isinstance(system_dictionary_path, str):
            options.append("-d {}".format(system_dictionary_path))
        self._tokenizer = MeCab(" ".join(options))

        # If the dictionary format is not specified,
        # konoha detects it from the name of the system dictionary.
        # For instance, system_dictionary_path=mecab-ipadic-xxxx -> ipadic and
        # system_dictionary_path=mecab-unidic-xxxx -> unidic.
        # If neither system_dictionary_path nor dictionary_format is given,
        # konoha assumes mecab-ipadic (the de facto standard).
        # Currently, konoha only supports ipadic. (TODO: unidic)
        if dictionary_format is None:
            if system_dictionary_path is None or "ipadic" in system_dictionary_path.lower():
                self._parse_feature = parse_feature_for_ipadic
            elif "unidic" in system_dictionary_path.lower():
                self._parse_feature = parse_feature_for_unidic
            else:
                raise ValueError(
                    f"Unsupported system dictionary: {system_dictionary_path}")
        else:
            if "ipadic" == dictionary_format.lower():
                self._parse_feature = parse_feature_for_ipadic
            elif "unidic" == dictionary_format.lower():
                self._parse_feature = parse_feature_for_unidic
            else:
                raise ValueError(
                    f"Unsupported dictionary format: {dictionary_format}")

    def tokenize(self, text: str) -> List[Token]:
        return_result = []
        parse_result = self._tokenizer.parse(text).rstrip(" ")
        for elem in parse_result.split("\n")[:-1]:
            return_result.append(self._parse_feature(elem))
        return return_result
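# A minimal usage sketch for the konoha-style MeCabTokenizer above. BaseTokenizer,
# Token and the parse_feature_for_* helpers are assumed to come from the surrounding
# konoha package; with no arguments the ipadic feature parser is used.
tokenizer = MeCabTokenizer()
for token in tokenizer.tokenize("すもももももももものうち"):
    print(token)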
def get_tokens(text):
    mecab = MeCab()
    tokens = []
    pos_word_dict = {}
    for t in text:
        res_raw = mecab.parse(t.encode('utf-8'))
        res = [r.split('\t') for r in res_raw.split('\n')]
        res = [r for r in res if len(r) == 2]
        res = [[r[0], r[1].split(',')[0]] for r in res]
        for r in res:
            if r[1] in pos_word_dict:
                pos_word_dict[r[1]].append(r[0])
            else:
                pos_word_dict[r[1]] = [r[0]]
        tokens.append(' '.join([r[0] for r in res]))
    return tokens, pos_word_dict
def main(text):
    """
    Tokenize the text with MeCab, then load the previously trained model and classify it.
    On macOS the neologd dictionary is usually installed at the path below;
    if you get an error, look up the correct path and fix it.
    """
    nm = MeCab("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    words = nm.parse(text)
    print('\n', words)
    classifier = ft.load_model('./model.bin')
    estimate = classifier.predict([words], k=2)
    estimate_2 = classifier.predict_proba([words], k=2)
    print('estimate:', estimate_2[0][0][1])
    if estimate[0][0] == '__label__2,':
        return ['ネガティブ', str(estimate_2[0][0][1])]
    elif estimate[0][0] == '__label__1,':
        return ['ポジティブ', str(estimate_2[0][0][1])]
def split_into_words(doc, name=''):
    # mecab = MeCab.Tagger("-Ochasen")
    mecab = MeCab("-Ochasen")  # morphological analysis
    # extract only the body of the work (note: valid_doc is not used below)
    valid_doc = trim_doc(doc)
    # split into words (lines is a list)
    lines = mecab.parse(doc).splitlines()
    words = []
    for line in lines:
        # split on tabs
        chunks = line.split('\t')
        if len(chunks) > 3 and (
                chunks[3].startswith('動詞') or
                chunks[3].startswith('形容詞') or
                (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))):
            # i.e. keep only the content words
            words.append(chunks[0])
    return LabeledSentence(words=words, tags=[name])
def parse2df(text, sysdic="/usr/local/lib/mecab/dic/naist-jdic"): """文毎に形態素解析を行い、結果をdataframeに格納して返す Args: text:形態素解析対象のテキスト Returns: 形態素解析結果を格納したdataframe カラムは['文番号','表層', '品詞1','品詞2','品詞3','品詞4','原型','posID'] """ # 結果格納用の空のDataFrame df = pd.DataFrame( index=[], columns=['文番号', '表層', '品詞1', '品詞2', '品詞3', '品詞4', '原型', 'posID']) text = re.sub(r".", "。\n", text) text = re.sub(r"。", "。\n", text) text = text.split("\n") # 改行で分割して配列にする while '' in text: # 空行は削除 text.remove('') parser = MeCab("-d " + sysdic) for index, sentence in enumerate(text): logging.debug(sentence) nodes = parser.parse(sentence, as_nodes=True) for node in nodes: if not node.is_eos(): # 品詞情報を分割 feature = node.feature.split(',') # dataframeに追加 series = pd.Series( [ index, # 文番号 node.surface, # 表層 feature[0], # 品詞1 feature[1], # 品詞2 feature[2], # 品詞3 feature[3], # 品詞4 feature[6], # 原型 node.posid # 品詞番号 ], index=df.columns) df = df.append(series, ignore_index=True) logging.debug("End : parse2df") return df
def process(x):
    tokenized_text = []
    token_count = 0
    no_parsing = get_no_parsing()
    known_words = get_knowledge("moi")
    words = {}
    nm = MeCab("-Owakati")
    for line in x.readlines():
        for n in nm.parse(line, as_nodes=True):
            tokenized_text.append(n.surface)
            if n.surface not in known_words and n.surface not in no_parsing:
                words[n.surface] = 0
            elif n.surface in known_words:
                words[n.surface] = known_words[n.surface]
            token_count += 1
        tokenized_text.append(u'<br>')
    return {'tokenized_text': tokenized_text,
            'words': words,
            'token_count': str(token_count)}
class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        MeCab = try_mecab_import()  # type: ignore[func-returns-value]
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __reduce__(self):
        return KoreanTokenizer, (self.vocab,)

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem (어간) or pre-final ending (선어말 어미)
            token.pos = TAG_MAP[token.tag_][POS]
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # Feature fields: POS tag[0], semantic class[1], final consonant (종성 유무)[2],
        # reading[3], type[4], first POS[5], last POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
            surface = node.surface
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

    def score(self, examples):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)
def run_ma(text, stop_path='', nBest=1):
    """
    Returns a dataframe with all information from the morphological analyzer.
    - input : string, {stopword file path}, {nbest number}
    - output : dataframe
    """
    options = r'-F%m,%f[0],%f[1],%f[2],%f[3],%f[4],%f[5],%f[6],%f[7]\n'
    options += " -N" + str(nBest)
    stopword_flag = False
    if stop_path != '':
        stopword_flag = True
    try:
        _me = MeCab(options)
        _df = pd.DataFrame(None, columns=[
            'surface', 'tag', 'meaning_class', 'final_consonant',
            'reading', 'type', 'first_tag', 'final_tag', 'expression'
        ])
        if stopword_flag:
            trie = load_stopword(stop_path)
        i = 0
        for term_str in str(_me.parse(text)).split('\n'):
            term_list = re.split(',', term_str)
            if stopword_flag and is_stopword(term_list[0], trie):
                continue
            if len(term_list) < 2:
                continue
            _df.loc[i] = term_list
            i += 1
    except Exception as e:
        print("[run_ma] messages of error : ", e)
    return _me, _df
def parseText(text, sysdic='/usr/local/lib/mecab/dic/mecab-ipadic-neologd'):
    text = text.split("\n")  # split on newlines into a list
    while '' in text:  # drop empty lines
        text.remove('')
    parser = MeCab("-d " + sysdic)
    lst = []
    for sentence in text:
        logging.debug(sentence)
        nodes = parser.parse(sentence, as_nodes=True)
        for node in nodes:
            features = node.feature.split(',')
            parts = features[0]
            if parts == '名詞':
                lst.append(node.surface)
            if parts in {'動詞', '形容詞', '形容動詞'}:
                lst.append(features[6])
    return lst
class Parser(object):
    def __init__(self):
        self.mc = MeCab()

    def parse_sentence(self, line):
        '''
        We receive a line of Japanese text, pass it through the MeCab
        morphological analyzer and add furigana where needed (only for kanji).
        '''
        new_line = ''
        for node in self.mc.parse(line, as_nodes=True):
            if not node.is_eos():
                word = Word(node)
                if word.kanji:
                    new_line += '{}[{}]'.format(word.content, word.furigana)
                else:
                    new_line += word.content
        return new_line
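# A minimal usage sketch for the furigana Parser above. The Word wrapper (providing
# .kanji, .content and .furigana) is assumed to be defined elsewhere in the same module.
parser = Parser()
print(parser.parse_sentence("漢字を読みます"))
# words containing kanji come back with their reading in brackets, e.g. 漢字[かんじ]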
def dissasembly(self, tweet_data: pd.DataFrame):
    nm = MeCab('-Owakati')
    first_word = []
    word_ls = []
    for t in range(len(tweet_data)):
        content = tweet_data.values[t].item()
        if '@' in content or '時報' in content or 'http' in content:
            continue
        parsed_content = nm.parse(content)
        ls = list(parsed_content.split())
        first_word.append(ls[0] + ',')
        word_ls.append(ls[1:])
    df_f = pd.DataFrame(first_word)
    df_w = pd.DataFrame(word_ls)
    df_f.to_csv(path['first_word'], index=False)
    df_w.to_csv(path['words'], index=False)
class mecab_owakatikun(Twitter_syusyukun, MeCab):
    def owakatikun(self):
        self.nm = MeCab('-Owakati')
        self.result = ''
        self.syusyu(auth)
        self.tweet_ls = self.nm.parse(str(self.df.values))
        i = len(self.tweet_ls)
        for h in range(i):
            if '@' in str(self.tweet_ls[h]):
                continue
            elif '時報' in str(self.tweet_ls[h]):
                continue
            elif 'RT' in str(self.tweet_ls[h]):
                continue
            else:
                self.result += self.tweet_ls[h]
        self.write_txt = ''.join(self.result)
        with open('/mnt/c/users/user/awesome/my_ai/tweets.txt', 'a') as f:
            f.write(str(self.write_txt) + '\n')
        print(self.write_txt)
def main():
    nm = MeCab('-Owakati')
    word = "MeCabは 京都大学情報学研究科−日本電信電話株式会社コミュニケーション科学基礎研究所 共同研究ユニットプロジェクトを通じて開発されたオープンソース 形態素解析エンジンです。 言語, 辞書,コーパスに依存しない汎用的な設計を 基本方針としています。 パラメータの推定に Conditional Random Fields (CRF) を用 いており, ChaSenが採用している 隠れマルコフモデルに比べ性能が向上しています。また、平均的に ChaSen, Juman, KAKASIより高速に動作します。 ちなみに和布蕪(めかぶ)は, 作者の好物です。"
    print(nm.parse(word))
    lis = [n.surface for n in nm.parse(word, as_nodes=True) if n.is_nor()]
    print(lis)
nm = MeCab()
for doc_id, syubunPart in rows:
    print "--------------"
    print "id:", doc_id
    # remove newlines and whitespace
    syubunPart = re.sub(r'(\n|\t| | )', '', syubunPart)
    # sentence splitting
    sensp = sensplit.SenSplit(syubunPart)
    syubun_list = sensp()
    for sentence in syubun_list:
        if sentence == '':
            continue
        morph_list = []  # list of the morphemes in this sentence
        sentence = sentence.encode('utf_8')  # unicode -> str (utf-8)
        for n in nm.parse(sentence, as_nodes=True):
            if not n.is_eos():
                # print n.surface
                morph_list.append(n.surface)
        x = []
        y = []
        for i in range(0, len(morph_list)):
            if i == 0:
                x.append('<BOS>')
                y.append(morph_list[i])
            elif i == len(morph_list) - 1:
                x.append(morph_list[i])
                y.append('<EOS>')
            else:
                x.append(morph_list[i])
                y.append(morph_list[i + 1])
import os
from gensim import corpora, matutils
from natto import MeCab

mc = MeCab()
txt_word_list = []
# read the folder that holds the text files
files = os.listdir(os.path.dirname(__file__) + '/path/txt')
# read the text files in the folder one by one
for file in files:
    # build a list of the nouns and verbs extracted from the text file (same processing as Q11-1)
    with open(os.path.dirname(__file__) + '/path/txt/' + file, 'r') as f:
        txt = f.read()
    word_list = []
    for n in mc.parse(txt, as_nodes=True):
        if not (n.is_bos() or n.is_eos()):
            part, word = n.feature.split(',', 1)
            if part == "名詞" or part == "動詞":
                word_list.append(n.surface)
    # add the word list for this text file
    txt_word_list.append(word_list)

# to build a bag of words, collect every distinct word and create a dictionary that assigns word IDs
corpus_dic = corpora.Dictionary(txt_word_list)
# convert each document's word list into a corpus (pairs of word ID and word count)
corpus_list = [corpus_dic.doc2bow(word_in_text) for word_in_text in txt_word_list]
# convert the corpus list into a sparse matrix (CSC format)
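# A minimal sketch of the conversion announced in the final comment above, using
# gensim's matutils (already imported): corpus2csc builds a scipy.sparse CSC
# term-document matrix from the bag-of-words corpus.
word_matrix = matutils.corpus2csc(corpus_list)
print(word_matrix.shape)  # (number of distinct terms, number of documents)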
# load the libraries needed to use MeCab from Python
import os
from natto import MeCab

# meros.txt holds the text of "Run, Melos!"
# create the MeCab object
mc = MeCab()
# when the input is a text file, read it like this
with open(os.path.dirname(__file__) + '/path/txt/meros.txt', 'r') as f:
    txt = f.read()
word_list = []
# run morphological analysis with MeCab
for part_and_word in mc.parse(txt, as_nodes=True):
    # make sure part_and_word is not a BOS/EOS node
    if not (part_and_word.is_bos() or part_and_word.is_eos()):
        # get the part of speech and the word from the analysis result
        part, word = part_and_word.feature.split(',', 1)
        # keep only nouns and verbs
        if part == '名詞' or part == '動詞':
            word_list.append(part_and_word.surface)