Example #1
class MorphAnalysis:
    def __init__(self):
        self.stop_path = str(pathlib.Path(
            __file__).resolve().parent) + '/data/stopwords_slothlib.txt'
        self.stopwords = []
        with open(self.stop_path, 'r') as f:
            self.stopwords = f.read().split()

        # Morphological analyzer (Juman++)
        self.jumanpp = Jumanpp()

    def to_wakati(self,
                  text,
                  allow_word_class=[
                      '名詞', '指示詞', '動詞', '形容詞', '判定詞', '助動詞', '副詞', '助詞',
                      '接続詞', '連体詞', '感動詞', '接頭辞', '特殊', '未定義語'
                  ],
                  remove_stopwords=False,
                  genkei=False):
        wkt = ""
        text = mojimoji.han_to_zen(text)
        rst = self.jumanpp.analysis(text)
        for mrph in rst.mrph_list():
            # midasi, yomi, genkei, hinsi, bunrui, katuyou1, katuyou2, imis, repname
            if remove_stopwords and (mrph.genkei in self.stopwords):
                continue
            if mrph.hinsi in allow_word_class:
                if genkei:
                    wkt += mrph.genkei + ' '
                else:
                    wkt += mrph.midasi + ' '
        return wkt
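A minimal usage sketch for the class above (the sample sentence and printed output are illustrative assumptions; actual tokens depend on the installed Juman++ dictionary and the stopword file):

analyzer = MorphAnalysis()
print(analyzer.to_wakati('ケーキを食べた'))               # e.g. "ケーキ を 食べた "
print(analyzer.to_wakati('ケーキを食べた', genkei=True))  # e.g. "ケーキ を 食べる "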
Example #2
def get_keitaiso_list_from_juman(text):
    """
    textを形態素解析して返す
    mecabでできない表記揺れの問題をjumanだと解決できる
    """

    jumanpp = Jumanpp()
    keitaiso_list = []
    hinshi_list = []
    exclusive_word_list = get_exclusive_word_list()

    # Spaces cause an error, and a leading "#" hangs the analysis (reason unclear),
    # so strip half-width and full-width spaces and replace "#".
    text = text.replace(" ", "").replace("　", "").replace("#", "/")

    result = jumanpp.analysis(unicode(text, 'utf-8'))  # pyknp's Juman handles only unicode strings (Python 2)
    try:
        for mrph in result.mrph_list():
            keitaiso = mrph.genkei.encode('utf-8')
            hinshi = mrph.hinsi.encode('utf-8')
            # Skip morphemes whose POS is not allowed, that appear in the junk-word list, or that are digits
            if not is_valid_word_class(hinshi) or keitaiso in exclusive_word_list or keitaiso.isdigit():
                continue

            keitaiso_list.append(keitaiso)
            hinshi_list.append(hinshi)
    except Exception:
        traceback.print_exc()

    return [keitaiso_list, hinshi_list]
def append_repname(words):
    """

    :param words: a list of Word instances
    :return: a list of Word instances with preprocessed words
             with the representative expressions

    """
    n_word = len(words)
    juman = Jumanpp()
    bar = progressbar.ProgressBar()
    for i in bar(range(n_word), max_value=n_word):
        word = words[i]

        if word.uid != i:
            continue  # already merged

        repname_set = []
        r = juman.analysis(word.p_surface)
        for mrph in r.mrph_list():
            if mrph.bunrui == '数詞':
                repname_set.append([kansuji2arabic(mrph.midasi)])
            elif mrph.repnames() != '':
                repname_set.append(mrph.repnames().split('?'))
            else:
                repname_set.append([mrph.midasi])
        words[i].alias.extend(expand_ambiguity(repname_set))
    return words
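kansuji2arabic and expand_ambiguity are not shown in this listing. Purely as an assumption, expand_ambiguity might enumerate every combination of the per-morpheme alternatives collected above, roughly like this hypothetical sketch:

from itertools import product

def expand_ambiguity(repname_set):
    # Hypothetical helper (not part of the original code): take one
    # representative form per morpheme and join each combination.
    return ['+'.join(combo) for combo in product(*repname_set)]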
Example #4
def main():
    if len(sys.argv) != 2:
        print('need one argument for a file.')
        return

    file_name = sys.argv[1]
    vocab_dict = defaultdict(int)
    juman = Jumanpp()

    with open(file_name, 'r', encoding='utf-8', newline='') as fr:
        text = fr.readlines()

        for line in text:
            # Juman++ does not handle half-width (ASCII) characters well, so
            # convert them to their full-width equivalents first.
            line = line.replace(' ', '　')
            line = line.translate(
                str.maketrans(
                    {chr(0x0021 + i): chr(0xFF01 + i)
                     for i in range(94)}))
            analysis = juman.analysis(line.replace('\n', ''))
            for m in analysis.mrph_list():
                vocab_dict[str(m.midasi)] += 1

    sorted_dict = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)

    print(sorted_dict)
    print(len(sorted_dict))
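For reference, the str.maketrans table used above maps the 94 printable ASCII characters U+0021–U+007E onto their full-width counterparts U+FF01–U+FF5E; a quick illustrative check (the sample string is arbitrary):

half_to_full = str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})
print('Juman++ 2!'.translate(half_to_full))  # -> 'Ｊｕｍａｎ＋＋ ２！' (the space is handled by the separate replace above)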
Example #5
def read_and_anlyze_text():
    sys.stdin = codecs.getreader('utf_8')(sys.stdin)
    sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
    jumanpp = Jumanpp()
    midasis = []
    repnames = []
    repname_counts = {}
    wikipedia_redirections = []
    w_rs = []
    w_r_counts = {}
    row_result = []
    while True:
        input_ = sys.stdin.readline()
        if input_ == '':
            break
        else:
            input_ = input_.strip()
            if input_ == '':
                continue
            result = jumanpp.analysis(input_)
            for mrph in result.mrph_list():
                if mrph.repname not in repname_counts:
                    repname_counts[mrph.repname] = 0
                if (mrph.midasi not in midasis) and (mrph.repname != u""):
                    repname_counts[mrph.repname] += 1
                w_r = get_wikipedia_redirection(mrph.imis)
                if not w_r:
                    w_r = mrph.midasi
                if w_r not in w_r_counts:
                    w_r_counts[w_r] = 0
                if mrph.midasi not in midasis:
                    w_r_counts[w_r] += 1
                midasis.append(mrph.midasi)
                repnames.append(mrph.repname)
                wikipedia_redirections.append(w_r)
                w_rs.append(w_r)
            midasis.append("\n")
            repnames.append("\n")
            wikipedia_redirections.append(None)
            w_rs.append("\n")
            repname_counts["\n"] = 0
            w_r_counts["\n"] = 0
            row_result.append(result.spec())

    yure_result = []
    for i, midasi in enumerate(midasis):
        yure = False
        if repname_counts[repnames[i]] > 1 or w_r_counts[w_rs[i]] > 1:
            yure = True
        yure_result.append({
            "midasi": midasi,
            "repname": repnames[i],
            "wikipedia_redirection": wikipedia_redirections[i],
            "repname_count": repname_counts[repnames[i]],
            "w_r_count": w_r_counts[w_rs[i]],
            "yure": yure
        })
    return row_result, yure_result
Example #6
    def analysis_text(self, text, debug=None):
        jumanpp = Jumanpp()
        # Juman++ occasionally raises unexpected errors on some inputs
        try:
            result = jumanpp.analysis(text)
        except Exception:
            return None
        if debug:
            self.__print_analyzed(result)
        return result
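A sketch of how the method above might be called from its owning class (the class itself is not part of the listing, so the names here are hypothetical):

# analyzer = TextAnalyzer()                                   # hypothetical owner class
# result = analyzer.analysis_text('ケーキを食べる', debug=True)
# if result is not None:
#     print([m.midasi for m in result.mrph_list()])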
Example #7
def read_and_anlyze_text():
    sys.stdin = codecs.getreader('utf_8')(sys.stdin)
    sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
    jumanpp = Jumanpp()
    midasis = []
    repnames = []
    repname_counts = {}
    wikipedia_redirections = []
    w_rs = []
    w_r_counts = {}
    row_result = []
    while True:
        input_ = sys.stdin.readline()
        if input_ == '' :
            break
        else :
            input_ = input_.strip()
            if input_ == '' :
                continue
            result = jumanpp.analysis(input_)
            for mrph in result.mrph_list():
                if mrph.repname not in repname_counts:
                    repname_counts[mrph.repname] = 0
                if (mrph.midasi not in midasis) and (mrph.repname != u""):
                    repname_counts[mrph.repname] += 1
                w_r = get_wikipedia_redirection(mrph.imis)
                if not w_r:
                    w_r = mrph.midasi
                if w_r not in w_r_counts:
                    w_r_counts[w_r] = 0
                if mrph.midasi not in midasis:
                    w_r_counts[w_r] += 1
                midasis.append(mrph.midasi)
                repnames.append(mrph.repname)
                wikipedia_redirections.append(w_r)
                w_rs.append(w_r)
            midasis.append("\n")
            repnames.append("\n")
            wikipedia_redirections.append(None)
            w_rs.append("\n")
            repname_counts["\n"] = 0
            w_r_counts["\n"] = 0
            row_result.append(result.spec())

    yure_result = []
    for i, midasi in enumerate(midasis):
        yure = False
        if repname_counts[repnames[i]] > 1 or w_r_counts[w_rs[i]] > 1:
            yure = True
        yure_result.append({"midasi":midasi,
            "repname": repnames[i],
            "wikipedia_redirection": wikipedia_redirections[i],
            "repname_count": repname_counts[repnames[i]],
            "w_r_count": w_r_counts[w_rs[i]],
            "yure": yure})
    return row_result, yure_result
def segment(texts):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(neologdn.normalize(text)))
            segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
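A hedged usage example for segment (neologdn, mojimoji's han_to_zen and a configured logger are assumed to be imported elsewhere in the original module; the segmentation shown is only indicative):

print(segment(['ケーキを食べた']))  # e.g. {'ケーキを食べた': 'ケーキ を 食べた'}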
Example #9
def parser_juman(text):
    from pyknp import Jumanpp
    jumanpp = Jumanpp()

    result = jumanpp.analysis(text)
    words = []

    for n in result.mrph_list():
        if n.hinsi != '助詞' and n.hinsi != '助動詞' and n.hinsi != '特殊' and n.bunrui != "空白":
            if n.hinsi == '動詞':
                words.append(n.genkei)
            else:
                words.append(n.midasi)
    return words
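A hedged usage example: since particles, auxiliaries and whitespace are filtered out and verbs are reduced to their base form, a call might look like this (exact tokens depend on the dictionary):

print(parser_juman('ケーキを食べた'))  # e.g. ['ケーキ', '食べる']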
Example #10
def segment_ja(texts, flag_keep_number=False):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(text))
            if flag_keep_number:
                segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            else:
                segmented = ' '.join('<数詞>' if m.bunrui == '数詞' else m.midasi
                                     for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            pdb.set_trace()
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
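An illustrative comparison of the flag_keep_number switch (assuming Juman++ tags the numeral as 数詞; the actual segmentation may differ):

print(segment_ja(['三人']))                          # e.g. {'三人': '<数詞> 人'}
print(segment_ja(['三人'], flag_keep_number=True))   # e.g. {'三人': '三 人'}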
Example #11
def main():
    model_w2v = gensim.models.KeyedVectors.load_word2vec_format(
        "/share/data/word2vec/2016.08.02/w2v.midasi.256.100K.bin",
        binary=True,
        unicode_errors='ignore')
    word2index = {w: i for i, w in enumerate(model_w2v.index2word)}

    model = BiLSTM(embed_mat=model_w2v.vectors, mid_size=128)
    serializers.load_npz("BiLSTM_attention.model", model)

    # Allow interactive testing from standard input
    jumanpp = Jumanpp()
    while True:
        input_sentence = sys.stdin.readline()  # str, includes the trailing newline
        if input_sentence == '':
            break  # EOF
        result = jumanpp.analysis(input_sentence.strip())
        doc = [mrph.midasi for mrph in result.mrph_list()]
        x = [doc2list(doc, word2index)]
        #        x = list2Var([doc2vec(doc)], np.float32, False)
        with chainer.using_config("train", False):
            y, attn_list = model.predict(x)

        p = np.argmax(y[0].data)
        doc_class = ["新聞記事", "  雑誌  ", " 教科書 ", " ブログ "]
        print("")
        print("*------------------------*")
        print("|                        |")
        print("|        " + doc_class[p] + "        |")
        print("|                        |")
        print("*------------------------*")
        print("")

        prob = F.softmax(y, axis=1)[0].data
        print("新聞記事: {:.6f}  雑誌: {:.6f}  教科書: {:.6f}  ブログ: {:.6f}".format(
            prob[0], prob[1], prob[2], prob[3]))

        for word, attn in sorted(zip(doc, attn_list),
                                 key=lambda x: x[1],
                                 reverse=True):
            print(word, end=", ")
        print("\n")
def parse(line):
    if line is None:
        return
    if line == "\n":
        return

    jumanpp = Jumanpp()

    replaced = re.sub('\n|\u3000| ', '', line)
    result = jumanpp.analysis(replaced)

    words = []

    for mrph in result.mrph_list():

        if mrph is not None:
            print('{0}                読み: {1}  品詞: {2}  活用1: {3}  活用2: {4}'.
                  format(mrph.midasi, mrph.yomi, mrph.hinsi, mrph.katuyou1,
                         mrph.katuyou2))
            words.append(mrph.midasi)

    return words
Example #13
class JumanParser(Parser):
    def __init__(self):
        super().__init__()
        remove_pattern = r'・|、|\,|\.| |　'
        self.remove_compiled = re.compile(remove_pattern)
        self.analyzer = Jumanpp()

    def parse(self, message):
        for sent in message.sentences:
            sent.text = self.remove_compiled.sub('', sent.text)
            parsed = self.analyzer.analysis(sent.text)
            mrph_list = parsed.mrph_list()
            sent.bag = self.create_bags(mrph_list)
            message.bags += sent.bag
        return message

    @staticmethod
    def create_bags(mrph_list):
        bag = []
        for mrph in mrph_list:
            if mrph.hinsi == '名詞' or mrph.hinsi == '動詞':
                bag.append(mrph.genkei)
        return bag
Example #14
class IntentSlotDatasetReader(DatasetReader):
    def __init__(self, lazy=False, max_tokens=64):
        super().__init__(lazy)
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens
        self.jumanpp = Jumanpp()

    def _read(self, file_path):
        with open(file_path, 'r') as f:
            for line in f:
                line = line.strip().split()
                label = line[-1]
                line = [tt.split(':') for tt in line[:-2]]
                text = [Token(tt[0]) for tt in line][0:self.max_tokens]
                tags = [tt[1] for tt in line][0:self.max_tokens]
                yield self.text_to_instance(text, label, tags)

    def tokenizer(self, text):
        text = [
            Token(mrph.midasi)
            for mrph in self.jumanpp.analysis(text).mrph_list()
        ][0:self.max_tokens]
        return text

    def text_to_instance(self, text, label=None, tags=None):
        text_field = TextField(text, self.token_indexers)
        fields = {'text': text_field}
        if label:
            label_field = LabelField(label, label_namespace='labels')
            fields['label'] = label_field
        if tags:
            tags_field = SequenceLabelField(tags,
                                            text_field,
                                            label_namespace='tags')
            fields['tag'] = tags_field
        return Instance(fields)
Example #15
# -*- coding: utf-8 -*-
from pyknp import Jumanpp
import sys
import codecs

j = Jumanpp()
line = sys.stdin.readline()
if sys.version_info[0] == 2:
    result = j.analysis(line.decode("utf-8"))
else:
    result = j.analysis(line)
for mrph in result.mrph_list():
    print(mrph.midasi)
Example #16
def main():

    Topic = []
    Utterance = []
    Relevance = []
    ID = []

    regex = u'[^ぁ-ん]+'

    # Training data: format [label, Topic & Utterance]
    #wf_Data = open("Tpc&UTRtEST.csv","w")

    all_filepaths = glob.glob('./testGS/*')
    for filepath in all_filepaths:
        lines = [
            line.rstrip() for line in fileinput.input(
                filepath, openhook=fileinput.hook_encoded('utf-8'))
        ]

        # Validate the JSON document as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('エラーあり')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])

        for argument in arguments:
            ID.append(argument["ID"])
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])

    TrueDataset = {}
    correctAnswer_0 = 0
    correctAnswer_1 = 0
    incorrectAnswer_0 = 0
    incorrectAnswer_1 = 0

    for line in list(set(Utterance)):
        T_List = []
        R_list = []
        id_tag = 0
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
                id_tag = ID[line_l]
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line + ":" +
                    id_tag] = str(Counter(R_list).most_common()[0][0])

    sorted(TrueDataset.items())

    # Analyze Utterance using Juman++ & knp
    jumanpp = Jumanpp()
    with open("CommonWords.csv", "w") as wf:
        wf.write("label,A,B\n")
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc, utr, id = key.split(":")[0], key.split(":")[1], key.split(
                ":")[2]
            topANDutrANDlabelList = []

            #parse Topic
            topic_analyed_List = []
            topANDutrANDlabelList.append("Topic")
            try:
                #0.7909880035111675
                #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0]
                #topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                #print(s)
                for mrph in topic_result.mrph_list():
                    try:
                        if len(re.findall(regex, mrph.genkei)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                if "数量" in mrph.imis:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append("[数]")
                                else:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append(mrph.genkei)
                    except:
                        continue
            except:
                continue

        #parse Utterance
            utter_analyed_List = []
            topANDutrANDlabelList.append("Utterance")
            try:
                if "、" in utr:
                    utrList = utr.split("、")
                    for sentence in utrList:

                        #reigi
                        if sentence == "":
                            continue

                        utter_result = jumanpp.analysis(sentence)
                        for mrph in utter_result.mrph_list():
                            try:
                                if len(re.findall(regex, mrph.genkei)) > 0:
                                    if ("名詞" in mrph.hinsi
                                            or "動詞" in mrph.hinsi):
                                        if "数量" in mrph.imis:
                                            utter_analyed_List.append(
                                                mrph.genkei)
                                            topANDutrANDlabelList.append("[数]")
                                        else:
                                            utter_analyed_List.append(
                                                mrph.genkei)
                                            topANDutrANDlabelList.append(
                                                mrph.genkei)

                                else:
                                    continue
                            except:
                                print("error")
                                continue

                else:
                    utter_result = jumanpp.analysis(utr)
                    for mrph in utter_result.mrph_list():
                        try:
                            if len(re.findall(regex, mrph.genkei)) > 0:
                                if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                    if "数量" in mrph.imis:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append("[数]")
                                    else:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append(
                                            mrph.genkei)
                        except:
                            print("error")
                            continue
                topANDutrANDlabelList.append("END")

            except:
                print("error")
                continue

            #if "END" in topANDutrANDlabelList:
            #wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1]) + "\n")#+ " [---] " + "{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr + "\",\"Relevance\":\"" + "null" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null}\n")
#            if "END" in topANDutrANDlabelList:
#                wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1])+ " [---] " + "{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr + "\",\"Relevance\":\"" + "null" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null}\n")

            if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0):
                #wf.write("{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr + "\",\"Relevance\":\"" + "1" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null},\n")
                wf.write(str(label) + ",1," + str(1) + "\n")
            else:
                wf.write(str(label) + ",1," + str(0) + "\n")
                #wf.write("{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr + "\",\"Relevance\":\"" + "0" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null},\n")

#            if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0):
#                if int(label) == 1:
#                    correctAnswer_1 += 1
#                else:
#                    incorrectAnswer_1 += 1
#
#            else:
#                if int(label) == 0:
#                    correctAnswer_0 += 1
#                else:
#                    incorrectAnswer_0 += 1

            now_line_cnt += 1
            print(now_line_cnt, len(TrueDataset), line_cnt)

        correctAnswer = correctAnswer_0 + correctAnswer_1
        print(correctAnswer * 1.0 / now_line_cnt, " ans0:", correctAnswer_0,
              " ans1:", correctAnswer_1, " miss:",
              now_line_cnt - correctAnswer)
        print(
            "詳細:",
            "p0t0",
            correctAnswer_0,
            "p0t1",
            incorrectAnswer_0,
            "p1t0",
            incorrectAnswer_1,
            "p1t1",
            correctAnswer_1,
        )

    label_cnt = 0
    for text, label in TrueDataset.items():
        if int(label) == 1:
            label_cnt += 1
    print(label_cnt / len(TrueDataset))
Example #17
    print("Processing Text:{}".format(i))
    if s == "":
        continue
    result = jumanpp.analysis(s)
    midasi_lst = []
    for w in result.mrph_list():
        midasi_lst.append([w.midasi.replace("_"," "),"O"])
    t_midasi.append(midasi_lst)
"""
print("-----------------")

for i, s in enumerate(wlist):
    print("Processing Word:{}".format(i))
    if s == "":
        continue
    result = jumanpp.analysis(s)
    midasi_lst = []
    for w in result.mrph_list():
        midasi = w.midasi.replace("_", "")
        if midasi == "":
            continue
        midasi_lst.append(midasi)
    w_midasi.append(midasi_lst)
"""
with open("./text_midasi.list","wb") as f:
    pickle.dump(t_midasi,f)
"""

with open("./word_midasi.list", "wb") as f:
    pickle.dump(w_midasi, f)
#-*- encoding: utf-8 -*-
from pyknp import Jumanpp
import sys
import codecs
# sys.stdin = codecs.getreader('utf_8')(sys.stdin)
# sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
# Use Juman++ in subprocess mode
jumanpp = Jumanpp()
result = jumanpp.analysis(u"ケーキを食べる")
for mrph in result.mrph_list():
    print("見出し:{0}".format(mrph.midasi))
Example #19
def main():

    print("fsovs")

    Topic = []
    Utterance = []
    Relevance = []

    regex  = u'[^ぁ-ん]+'

    # Training data: format [label, Topic & Utterance]
    wf_Data = open("Tpc&UTR_Stance.csv","w")

    all_filepaths=glob.glob('./training/*')
    for filepath in all_filepaths:
        lines = [line.rstrip() for line in fileinput.input(
            filepath, openhook=fileinput.hook_encoded('utf-8'))]

        # Validate the JSON document as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('エラーあり')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])
        
        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Stance"])       

    TrueDataset = {}
    correctAnswer_0 = 0
    correctAnswer_1 = 0
    for line in list(set(Utterance)): 
        T_List = [] 
        R_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str(Counter(R_list).most_common()[0][0])

    sorted(TrueDataset.items())

    # Analyze Utterance using Juman++ & knp
    jumanpp = Jumanpp()
    with open("incorrectTrus.txt","w") as wf:
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc,utr = key.split(":")[0],key.split(":")[1]
            topANDutrANDlabelList = []

            #parse Topic
            topic_analyed_List = []
            topANDutrANDlabelList.append("Topic")
            try:
                #0.7909880035111675
                #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0] 
                #topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                #print(s)
                for mrph in topic_result.mrph_list():
                    try :
                        if len(re.findall(regex, mrph.genkei)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                if "数量" in mrph.imis:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append("[数]") 
                                else:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append(mrph.genkei)
                    except:
                        continue
            except:
                continue

        #parse Utterance
            utter_analyed_List = []
            topANDutrANDlabelList.append("Utterance")
            try:
                if "、" in utr:
                    utrList = utr.split("、")
                    for sentence in utrList:

                        #reigi
                        if sentence == "":
                            continue
                        
                        utter_result = jumanpp.analysis(sentence)
                        for mrph in utter_result.mrph_list():
                            try :
                                if len(re.findall(regex, mrph.genkei)) > 0:
                                    if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                        if "数量" in mrph.imis:
                                            utter_analyed_List.append(mrph.genkei)
                                            topANDutrANDlabelList.append("[数]") 
                                        else:
                                            utter_analyed_List.append(mrph.genkei)
                                            topANDutrANDlabelList.append(mrph.genkei)

                                else:
                                    continue
                            except:
                                print("error")
                                continue

                else:
                    utter_result = jumanpp.analysis(utr)
                    for mrph in utter_result.mrph_list():
                        try :
                            if len(re.findall(regex, mrph.genkei)) > 0:
                                if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                    if "数量" in mrph.imis:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append("[数]") 
                                    else:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append(mrph.genkei)
                        except:
                            print("error")
                            continue
                topANDutrANDlabelList.append("END")
                    
            except:
                print("error")
                continue

            if "END" in topANDutrANDlabelList:
                #print(topANDutrANDlabelList)
                wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1])+"\n")
            #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List)))

            #if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0):
                #print("1:",label)
            if int(label) == 1:
                wf.write(tpc + ":" + utr + "[" + "1" + ":" +label + "]\n")
            elif int(label) == 2:
                wf.write(tpc + ":" + utr + "[" + "2" + ":" +label + "]\n")
            else:
                wf.write(tpc + ":" + utr + "[" + "0" + ":" +label + "]\n")
Example #20
    try:
        a = str(row[1])
        text_list.append(a)
    except:
        print('read_fail')
        load_miss_count += 1
f.close()

num = 0

for line in text_list:
    text = ''
    num += 1
    
    try:
        result = jumanpp.analysis(line)
        print(num)
        for mrph in result.mrph_list():
            hinsi = mrph.hinsi
            if(hinsi == "" or hinsi == "" or hinsi == "" or hinsi == ""):
                text += str(mrhp.midasi) + ' '
        outF.write(str(text) + '\n')
        
    except (AttributeError, TypeError, KeyError, ValueError):
        print("missing_value")
        analyze_miss_count += 1
        continue

outF.close()
print(load_miss_count)
print(analyze_miss_count)
Example #21
def main():

    all_filepaths = glob.glob('./training/*')
    #print("frhifr",all_filepaths)

    Topic = []
    Utterance = []
    Relevance = []
    FactCheck = []
    Stance = []

    for filepath in all_filepaths:

        # args = get_args()
        # Load the JSON
        # src = '-' if not hasattr(args, 'json_file') else args.json_file

        lines = [
            line.rstrip() for line in fileinput.input(
                filepath, openhook=fileinput.hook_encoded('utf-8'))
        ]

        # Validate the JSON document as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('エラーあり')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])

        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])
            FactCheck.append(argument["Fact-checkability"])
            Stance.append(argument["Stance"])

    TrueDataset = []
    for line in list(set(Utterance)):
        cnt = 0
        R_list = []
        F_list = []
        S_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                cnt += 1
                R_list.append(Relevance[line_l])
                F_list.append(FactCheck[line_l])
                S_list.append(Stance[line_l])
        plane = line + " " + str(
            Counter(R_list).most_common()[0][0]) + " " + str(
                Counter(F_list).most_common()[0][0]) + " " + str(
                    Counter(S_list).most_common()[0][0])
        if not ((cnt == 5 and Counter(S_list).most_common()[0][1] == 2) or
                (cnt == 3 and Counter(S_list).most_common()[0][1] == 1)):
            TrueDataset.append(plane)

    # Analyze Utterance using Juman++
    jumanpp = Jumanpp()
    for arguments in TrueDataset:
        #print(argument["Utterance"],argument["Relevance"],argument["Fact-checkability"],argument["Stance"],argument["Class"])
        argument = arguments.split(" ")
        result = jumanpp.analysis(argument[0])
        analyed_argument = ""
        for mrph in result.mrph_list():
            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                analyed_argument += mrph.midasi + " "

        analyed_argument += "\t"
        analyed_argument += argument[1] + "\t"
        analyed_argument += argument[2] + "\t"
        analyed_argument += argument[3]

        print(analyed_argument)
Example #22
#-*- encoding: utf-8 -*-
from pyknp import Jumanpp
import sys
import codecs

jumanpp = Jumanpp()
f = codecs.open(sys.argv[1], 'r', 'utf-8')
for line in f:
    text = line.rstrip()
    try:
        result = jumanpp.analysis(text)
        tokens = [mrph.midasi for mrph in result.mrph_list()]
        print('\t'.join(tokens))
    except ValueError:
        print('VALUE ERROR')

f.close()

# # Use JUMAN++ in subprocess mode
# jumanpp = Jumanpp()
# result = jumanpp.analysis(u"ケーキを食べる")
# for mrph in result.mrph_list():
#     print(u"見出し:{0}".format(mrph.midasi))
Example #23
                    help="classify text",
                    type=str,
                    default="日本でのビジネス")
parser.add_argument("--path_to_model",
                    help="model to use",
                    type=str,
                    default="./models/my-model.ckpt")

args = parser.parse_args()

jumanpp = Jumanpp()
classify_data = []

vocab = Vocabulary("data_use.txt")

result = jumanpp.analysis(args.input_text)
for mrph in result.mrph_list():
    word = mrph.midasi
    classify_data.append(vocab.stoi(word))

classify_data = data_helper.pad_one(classify_data, 256, 0)

with open("training_config.json") as f:
    params = json.load(f)

embedding_mat = np.load("./models/embedding.npy")
session_config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
sess = tf.Session(config=session_config)

with sess.as_default():
Example #24
def main():

    Topic = []
    Utterance = []
    Relevance = []

    regex  = u'[^ぁ-ん]+'

    all_filepaths=glob.glob('./testGS/*')
    for filepath in all_filepaths:
        lines = [line.rstrip() for line in fileinput.input(
            filepath, openhook=fileinput.hook_encoded('utf-8'))]

        # Validate the JSON document as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('エラーあり')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])
        
        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])       

    TrueDataset = {}
    correctAnswer = 0
    for line in list(set(Utterance)): 
        T_List = [] 
        R_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str(Counter(R_list).most_common()[0][0])
        
    trueCnt = 0
    for key, label in TrueDataset.items():
        if int(label) == 1:
            trueCnt += 1
    print("AllTrue:",trueCnt/ len(TrueDataset))

    # Analyze Utterance using Juman++ & knp
    jumanpp = Jumanpp()
    with open("incorrect_test.txt","w") as wf:
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc,utr = key.split(":")[0],key.split(":")[1]

        #print(tpc + ":" + utr + "[" + label + "]")

        #parse Topic
            topic_analyed_List = []
            try:
                #0.7909880035111675
                #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0] 
                #topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                #print(s)
                for mrph in topic_result.mrph_list():
                    try :
                        if len(re.findall(regex, mrph.midasi)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): 
                                topic_analyed_List.append(mrph.midasi)
                    except:
                        continue
            except:
                #print("Error.",tpc)
                continue

        #parse Utterance
            utter_analyed_List = []
            try:
                utter_result = jumanpp.analysis(utr)
                for mrph in utter_result.mrph_list():
                    try :
                        if len(re.findall(regex, mrph.midasi)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): 
                                utter_analyed_List.append(mrph.midasi)
                    except:
                        continue
            except:
                #print("Error.",utr)
                continue

            #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List)))



            if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0):
                #print("1:",label)
                if int(label) == 1:
                    correctAnswer += 1
                else:
                    wf.write(tpc + ":" + utr + "[" + "1" + ":" +label + "]\n")
            else:
                #print("0:",label)
                if int(label) == 0:
                    correctAnswer += 1
                else:
                    wf.write(tpc + ":" + utr + "[" + "0" + ":" +label + "]\n")
            now_line_cnt += 1
            #print( now_line_cnt, "/", line_cnt)

    print("acurracy:",correctAnswer*1.0 / len(TrueDataset))
Example #25
# Build the training/development data

with open('data/training_data_sample.json', 'r') as f:
    corpus = json.load(f)

for key in corpus:

    with open('data/%s' % key, 'w') as f:

        for data in corpus[key]:
            text = data['text'].translate(han2zen)

            # Morphological analysis

            mrphs = [
                mrph.midasi for mrph in jumanpp.analysis(text).mrph_list()
            ]

            # Build a mapping from character position to word position

            c2w = {}
            c = 0
            w = 0
            for mrph in mrphs:
                for i in range(len(mrph)):
                    c2w[c] = w
                    c += 1
                w += 1
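
            # Illustration (hypothetical segmentation): if mrphs were
            # ['私', 'は', '学生'], the loop above yields
            # c2w == {0: 0, 1: 1, 2: 2, 3: 2}, i.e. character offsets 2 and 3
            # both map to the third word.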

            # Build the slot sequence