def tester_1():  # Simple
    knp = pyknp.KNP(option="-tab -dpnd", rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    test = "昨日ノーベル物理学賞について学んだ"
    tagged_test = extract_ne(test, knp, detail_flag=False)
    print(swap_ne_tag_with_only_tag(tagged_test[0], "ARTIFACT", "PRIZE"))
def prompt():
    knp = pyknp.KNP(option="-tab -dpnd", rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    while True:
        print(">>> ", end="")
        line = input()
        tagged_line = ner_func.extract_ne(line, knp, detail_flag=False)
        print(tagged_line[0] + "\n")
def __init__(self, logger=None, verbose=False, jumanpp=False):
    self._knp = pyknp.KNP(jumanpp=jumanpp)
    self.verbose = verbose
    if logger:
        self.logger = logger
    else:
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.WARNING)
        logging.captureWarnings(True)
def tester_2():  # Swap with ne
    knp = pyknp.KNP(option="-tab -dpnd", rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    test = "昨日ノーベル物理学賞について学んだ"
    test1 = "昨日英語の教科書を買った"
    tagged_test = extract_ne(test, knp, detail_flag=False)
    tagged_test1 = extract_ne(test1, knp, detail_flag=False)
    print(swap_ne_tag_with_ne_and_tag(tagged_test[0], "ノーベル物理学賞", "PRIZE", tagged_test[2]))
    print(swap_ne_tag_with_ne_and_tag(tagged_test1[0], "教科書", "EDUCATION", tagged_test1[2]))
    print(tagged_test[3])
def _knp(sentence):
    if sentence == "":
        return None
    knp = pyknp.KNP()
    try:
        result = knp.parse(sentence)
    except Exception:
        return None
    # Store the results in classes
    phrases = OrderedDict()  # dictionary of phrase (bunsetsu) objects
    for bnst in result.bnst_list():
        ph = mynlp.Phrase()
        ph.parent_id = bnst.parent_id
        ph.dpndtype = bnst.dpndtype
        # Store information on the words contained in this phrase
        for mrph in bnst.mrph_list():  # mrph_list: morphemes within the phrase
            word = mynlp.Word()
            word.surface = mrph.midasi  # surface form
            word.base = mrph.genkei  # base form
            word.yomi = mrph.yomi  # reading
            # Detailed part-of-speech information
            pos_info = mrph.spec().split(" ")  # or .new_spec()
            # fields: surface, reading, lemma, POS major class, POS major class id,
            # POS minor class, POS minor class id, conjugation type, conjugation type id,
            # conjugation form, conjugation form id, semantic information
            word.pos = pos_info[3]  # part of speech
            word.pos_detail = pos_info[5]  # POS subcategory
            # Semantic information: representative notation, kanji reading, category, etc.
            imis = mrph.imis.split()
            for imi in imis:
                if "代表表記" in imi:
                    word.descriptions = imi.split(":", 1)[-1]
                elif "カテゴリ" in imi:
                    word.category = imi.split(":", 1)[-1]
                elif "ドメイン" in imi:
                    word.domain = imi.split(":", 1)[-1]
                elif ("人名:" in imi) or ("地名:" in imi):  # proper nouns
                    word.proper_noun = imi.split(":", 1)[-1]
                else:
                    word.another = word.another + imi + " "
            ph.words.append(word)
        phrases[bnst.bnst_id] = ph
    for ph_i, ph in phrases.items():
        if ph.parent_id != -1:
            phrases[ph.parent_id].children.append(ph_i)
    return phrases
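# A short usage sketch for _knp above (an illustration, not part of the
# original module): walk the returned OrderedDict and print each phrase's
# surface string together with the ids of its children. Assumes mynlp.Phrase
# exposes .words and .children exactly as populated in _knp.
phrases = _knp("太郎は花子に会いに行った")
if phrases is not None:
    for ph_id, ph in phrases.items():
        surface = "".join(w.surface for w in ph.words)
        print(ph_id, surface, "-> children:", ph.children)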
def main():
    print("label:x or o")
    string = '1990年生まれはみんなごはんとラーメンを一緒に食べることが普通だ'
    with open('./label_file') as fp:
        label = deque(fp.readlines())
    label = None  # note: this discards the labels read just above
    with open('./all_text.csv') as f:
        for line in f.readlines():
            flag, string = line.split(',')
            if flag == 'x':
                print(string)
                knp = kp.KNP(option='-tab -anaphora')
                knp_parser = KNP_Parser()
                knp_parser.parse_knp(knp, string, label=label)
def __init__(
    self,
    knp_kwargs: Optional[Dict[str, str]] = None,
    preprocessor: Callable[[str], str] = None,
):
    import pyknp

    cmd = get_juman_command()
    assert cmd
    knp_kwargs = knp_kwargs or {}
    knp_kwargs.setdefault("jumancommand", cmd)
    self.knp = pyknp.KNP(**knp_kwargs)
    self.knp_kwargs = knp_kwargs
def get_ave_vec(line):
    """Average the vectors of content words (nouns, adjectives, verbs) in a sentence."""
    hinsi_list = ['名詞', '形容詞', '動詞']  # content-word POS classes
    import pyknp
    knp = pyknp.KNP()
    decompose = True
    total_vec = []
    result = knp.parse(line)
    for mrph in result.mrph_list():
        if mrph.hinsi in hinsi_list:
            if decompose:
                midasi, ending = mrph2decomposed_str(mrph)
            else:
                midasi = mrph2str(mrph)
            # model: word-vector model assumed to be defined at module level
            if midasi in model.vocab:
                total_vec.append(model[midasi])
    if total_vec == []:
        return False
    return sum(total_vec) / len(total_vec)
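# mrph2str and mrph2decomposed_str are not defined in this snippet. A plausible
# minimal stand-in is sketched below; these bodies are assumptions, not the
# original helpers.
def mrph2str(mrph):
    # Use the base form (genkei) as the vocabulary lookup key.
    return mrph.genkei

def mrph2decomposed_str(mrph):
    # Naive decomposition: the whole base form as the stem, empty ending.
    return mrph.genkei, ""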
def tester():
    knp = pyknp.KNP(option="-tab -dpnd", rcfile='/usr/local/etc/knprc',
                    jumanrcfile='/usr/local/etc/jumanrc')
    line1 = "今年の人工知能学会は2016年6月6日~9日まで北九州国際会議場で開催されます"
    line2 = "昨夜,太郎は夜9時に花子へ会いに行った"
    line3 = "佐藤は昨夜,国会議事堂まで個人情報保護法についての議論を見に行った"
    line4 = "藤本太郎喜左衛門将時能という名前の人がいるらしい"
    tagged_line1 = ner_func.extract_ne(line1, knp, detail_flag=True)
    tagged_line2 = ner_func.extract_ne(line2, knp, detail_flag=True)
    tagged_line3 = ner_func.extract_ne(line3, knp, detail_flag=True)
    tagged_line4 = ner_func.extract_ne(line4, knp, detail_flag=True)
    print()
    print(line1)
    print(tagged_line1[0] + "\n")
    print(line2)
    print(tagged_line2[0] + "\n")
    print(line3)
    print(tagged_line3[0] + "\n")
    print(line4)
    print(tagged_line4[0] + "\n")
def __parse_bnst(line):
    """
    Case analysis of bunsetsu (phrases)
    :param line: input str
    :return: bnst_list from pyknp
    """
    import pyknp
    # Show detailed analysis results, with anaphora resolution
    knp = pyknp.KNP(option="-tab -anaphora")
    # Remove spaces and tabs
    line2 = "".join(line.split())
    # Below is a stopgap that truncates the text.
    # TODO: handle parsing the full text.
    # Replace full-width periods with kuten, then split from the right
    line2_list = line2.replace(".", "。").rsplit("。", 0)
    n = 1
    while len(line2_list[0]) >= 218:
        # Replace full-width periods with kuten, then split from the right
        line2_list = line2.replace(".", "。").rsplit("。", n)
        n += 1
    results = knp.parse(line2_list[0])
    return results.bnst_list()
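# For reference, rsplit's maxsplit argument counts splits from the right, so
# each iteration of the loop above trims one more trailing clause from
# line2_list[0]:
#   "あ。い。う".rsplit("。", 0)  ->  ["あ。い。う"]
#   "あ。い。う".rsplit("。", 1)  ->  ["あ。い", "う"]
#   "あ。い。う".rsplit("。", 2)  ->  ["あ", "い", "う"]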
def ner(self, line):
    """
    $ python -m sagas.ja.knp_procs ner "太郎は5月18日の朝9時に花子に会いに行った."
    :param line:
    :return:
    """
    import re

    # Preparing KNP:
    # option (str) – KNP analysis options (-tab, which outputs detailed results,
    # is required; -anaphora adds ellipsis/anaphora resolution, -dpnd skips case
    # analysis and runs dependency parsing only, etc.)
    knp = pyknp.KNP(option="-tab -dpnd", jumanpp=False)

    def make_np_tagged_text(src_text: str):
        tagged_text = src_text  # working copy
        result = knp.parse(src_text)
        # tagging
        for tag in result.tag_list():
            if "NE:" in tag.fstring:  # the fstring carries an NE phrase
                span = result.get_tag_span(tag.tag_id)
                print('..', span, tag.fstring)
                # extract the NE phrase
                search_r = re.search("<NE:(.*):(.*)>", tag.fstring)
                tagged_ne_phrase = search_r.group(0)
                ne_phrase = search_r.group(2)
                # overwrite in the source text
                tagged_text = tagged_text.replace(ne_phrase, tagged_ne_phrase)
        return tagged_text

    tc.emp('green', line)
    tc.emp('yellow', make_np_tagged_text(line))
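# A hedged alternative sketch (not the original code): str.replace above
# rewrites every occurrence of the NE string, which can mis-tag repeated
# substrings. Collecting character spans via BList.get_tag_span instead keeps
# each entity tied to its position; the exact span semantics are an assumption
# to verify against the installed pyknp version.
import re
import pyknp

def ne_spans(src_text):
    knp = pyknp.KNP(option="-tab -dpnd", jumanpp=False)
    result = knp.parse(src_text)
    spans = []
    for tag in result.tag_list():
        m = re.search("<NE:(.*?):(.*?)>", tag.fstring)
        if m:
            start, end = result.get_tag_span(tag.tag_id)
            spans.append((start, end, m.group(1), m.group(2)))  # (span, NE class, phrase)
    return spans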
import re
import csv
import os
import sys
import codecs
import json
import pyknp

title = sys.argv[1]
#ff = codecs.open("fuman_a201508.json","r","utf-8")
ff = codecs.open(title + ".json", "r", "utf-8")
topic_json = json.loads(ff.read())
ff.close()

stop_words = [u"の", u"こと"]  # temporary
knp = pyknp.KNP()
extracted_result = []  # {topicID, topicParameter, sentence, extracted, tokenized}
for topics in topic_json:
    ## Build scores for matched words
    word_match_score = {}
    for tt in range(0, len(topics[u"wordsInTopic"])):
        if topics[u"wordsInTopic"][tt] not in stop_words:
            word_match_score[topics[u"wordsInTopic"][tt]] = len(
                topics[u"wordsInTopic"]) - tt
    ##
    for sentences in topics[u"sentences"]:
        for ss in re.split(r"\n|。|!|?|!|\?", sentences):
            try:
                kres = knp.parse(ss.replace(' ', ''))
            except Exception:
                continue  # skip sentences KNP fails to parse
def __init__(self):
    self.knp = pyknp.KNP()
    self.ok_type = ['形容詞', '名詞', '動詞']
    self.swapwords = self.__get_stopwords()
def createRelTrainData(filename, text_list, term_dic, n=3):
    juman_results = []
    # Process each sentence
    for sec_i, text in enumerate(text_list):
        juman_result = juman2mecab(execJuman(text))
        juman_results.append(juman_result)
    mecab_result_list = list(
        filter(lambda x: x not in ["EOS", ""], juman_results))
    head_poses = []  # head (character) position of each abstract sentence
    p = 0
    for sentence in replaceDpoint(text_list[1].replace(".", ".")).split("."):
        head_poses.append(p)
        p += (len(sentence.replace("<dpoint>", ".")) + 1)
    # Separate the terms by the sentence they appear in
    print("process term list")
    title_terms = []  # list of (term, pos) tuples
    abst_terms = {}  # dict of lists of (term, pos) tuples
    for p in head_poses:
        abst_terms[p] = []  # initialize with empty lists
    for term, poses in term_dic.items():
        for pos in poses:
            if pos[0] == 0:
                title_terms.append((term, pos))
            else:
                abst_terms[getPosIndex(pos[2], head_poses)].append((term, pos))
    terms_list = abst_terms.values()
    print("process KNP")
    knp = pyknp.KNP(command='knp', option='-tab -anaphora',
                    jumancommand='jumanpp', jumanpp=True)
    knp_results = [[], []]  # list for the title, list for the abstract
    knp_results[0].append(knp.parse(text_list[0]))
    for sentence in re.split(r"\.|。", replaceDpoint(text_list[1].replace(".", "."))):
        knp_results[1].append(
            knp.parse(sentence.replace("<dpoint>", ".") + "."))
    head_morph_ids = [0]  # morpheme id at the head of each abstract sentence
    now = 0
    for rslt in knp_results[1]:
        now = now + len(rslt.mrph_list())
        head_morph_ids.append(now)
    head_morph_ids.pop()
    feature_datas = []
    done_mrph_num = 0
    done_mrph_num_next = 0
    print("process each term")
    for i, terms in enumerate(terms_list):
        print(terms)
        if i > 0:
            done_mrph_num_next += len(knp_results[1][i - 1].mrph_list())
        for termL in terms:  # L -> R
            s_posL = termL[1][1] - done_mrph_num
            e_posL = termL[1][1] - done_mrph_num
            sec_numL = termL[1][0]
            # Fetch the KNP result that contains the term
            if sec_numL == 0:
                knprslt = knp_results[0][0]
                s_idL = termL[1][1]
            else:
                for h_i, head in enumerate(head_morph_ids):
                    if e_posL < head:
                        knprslt = knp_results[1][h_i - 1]
                        h = head_morph_ids[h_i - 1]
                        break
                s_idL = e_posL - h  # morpheme id within that sentence
            e_idL = s_idL
            tmp_term_len = len(
                mecab_result_list[sec_numL][e_posL][0])  # find e_pos
            while tmp_term_len != len(termL[0]):
                tmp_term_len += len(mecab_result_list[sec_numL][e_posL + 1][0])
                e_posL += 1
                e_idL += 1
            kihonL_b, kihonL_f, hinshiL_b, hinshiL_f = getBehindFrontNMorphenesByKNP(
                knprslt, s_idL, e_idL, n)  # this part is work in progress
            kihon_kakariL_f, hinshi_kakariL_f = getBehindFrontNMorphenesKakariByKNP(
                knprslt, s_idL, e_idL, n)
            for termR in terms:
                if isSameTerm(termL, termR):
                    continue
                print(termL[0], "->", termR[0])
                s_posR = termR[1][1] - done_mrph_num
                e_posR = termR[1][1] - done_mrph_num
                sec_numR = termR[1][0]
                # Fetch the KNP result that contains the term
                if sec_numR == 0:
                    knprslt = knp_results[0][0]
                    s_idR = termR[1][1]
                else:
                    for h_i, head in enumerate(head_morph_ids):
                        if e_posR < head:
                            knprslt = knp_results[1][h_i - 1]
                            h = head_morph_ids[h_i - 1]
                            break
                    s_idR = e_posR - h  # morpheme id within that sentence
                e_idR = s_idR
                tmp_term_len = len(
                    mecab_result_list[sec_numR][e_posR][0])  # find e_pos
                while tmp_term_len != len(termR[0]):
                    tmp_term_len += len(mecab_result_list[sec_numR][e_posR + 1][0])
                    e_posR += 1
                    e_idR += 1
                kihonR_b, kihonR_f, hinshiR_b, hinshiR_f = getBehindFrontNMorphenesByKNP(
                    knprslt, s_idR, e_idR, n)  # this part is work in progress
                kihon_kakariR_f, hinshi_kakariR_f = getBehindFrontNMorphenesKakariByKNP(
                    knprslt, s_idR, e_idR, n)
                tmpdata = {
                    "termL": termL[0],
                    "termR": termR[0],
                    "termLpos": str(termL[1][0]) + "," + str(termL[1][1]) + "," + str(termL[1][2]),
                    "termRpos": str(termR[1][0]) + "," + str(termR[1][1]) + "," + str(termR[1][2])
                }
                """
                tmpdata["kihonL_before_appear"]=(kihonL_b)
                tmpdata["kihonL_front_appear"]=(kihonL_f)
                tmpdata["hinshiL_before_appear"]=(hinshiL_b)
                tmpdata["hinshiL_front_appear"]=(hinshiL_f)
                tmpdata["kihonL_kakari_front_appear"]=(kihon_kakariL_f)
                tmpdata["hinshiL_kakari_front_appear"]=(hinshi_kakariL_f)
                tmpdata["kihonR_before_appear"]=(kihonR_b)
                tmpdata["kihonR_front_appear"]=(kihonR_f)
                tmpdata["hinshiR_before_appear"]=(hinshiR_b)
                tmpdata["hinshiR_front_appear"]=(hinshiR_f)
                tmpdata["kihonR_kakari_front_appear"]=(kihon_kakariR_f)
                tmpdata["hinshiR_kakari_front_appear"]=(hinshi_kakariR_f)
                """
                #extend_feature_vector_rel(tmpdata,termL[0],kihonL_b,kihonL_f,hinshiL_b,hinshiL_f,kihon_kakariL_f,hinshi_kakariL_f,termR[0],kihonR_b,kihonR_f,hinshiR_b,hinshiR_f,kihon_kakariR_f,hinshi_kakariR_f)  # this part is work in progress
                #extend_feature_vector_kaku_rel(tmpdata,termL[1][1],e_posL,knp_results[i-1])  # case type, left
                #extend_feature_vector_kaku_rel(tmpdata,termR[1][1],e_posR,knp_results[i-1])  # right
                #extend_feature_vector_kakarinum_rel(tmpdata,termL[0],termL[1][1],e_posL,termR[0],termR[1][1],e_posR,knp_results[i-1])  # number of dependencies
                #extend_feature_vector_kakaritype_rel(tmpdata,termR[1][1],e_posR,knp_results[i-1])  # dependency type
                feature_datas.append(tmpdata)
            if i + 1 != len(terms_list):  # next sentence
                for termR in list(terms_list)[i + 1]:
                    print(termL[0], "->", termR[0])
                    s_posR = termR[1][1] - done_mrph_num_next
                    e_posR = termR[1][1] - done_mrph_num_next
                    if isSameTerm(termL, termR):
                        continue
                    print(termL[0], "->", termR[0])
                    # note: the offsets computed just above are overwritten here
                    s_posR = termR[1][1] - done_mrph_num
                    e_posR = termR[1][1] - done_mrph_num
                    sec_numR = termR[1][0]
                    # Fetch the KNP result that contains the term
                    if sec_numR == 0:
                        knprslt = knp_results[0][0]
                        s_idR = termR[1][1]
                    else:
                        for h_i, head in enumerate(head_morph_ids):
                            if e_posR < head:
                                knprslt = knp_results[1][h_i - 1]
                                h = head_morph_ids[h_i - 1]
                                break
                        s_idR = e_posR - h  # morpheme id within that sentence
                    e_idR = s_idR
                    tmp_term_len = len(
                        mecab_result_list[sec_numR][e_posR][0])  # find e_pos
                    while tmp_term_len != len(termR[0]):
                        tmp_term_len += len(
                            mecab_result_list[sec_numR][e_posR + 1][0])
                        e_posR += 1
                        e_idR += 1
                    kihonR_b, kihonR_f, hinshiR_b, hinshiR_f = getBehindFrontNMorphenesByKNP(
                        knprslt, s_idR, e_idR, n)  # this part is work in progress
                    kihon_kakariR_f, hinshi_kakariR_f = getBehindFrontNMorphenesKakariByKNP(
                        knprslt, s_idR, e_idR, n)
                    tmpdata = {
                        "termL": termL[0],
                        "termR": termR[0],
                        "termLpos": str(termL[1][0]) + "," + str(termL[1][1]) + "," + str(termL[1][2]),
                        "termRpos": str(termR[1][0]) + "," + str(termR[1][1]) + "," + str(termR[1][2])
                    }
                    """
                    tmpdata["kihonL_before_appear"]=(kihonL_b)
                    tmpdata["kihonL_front_appear"]=(kihonL_f)
                    tmpdata["hinshiL_before_appear"]=(hinshiL_b)
                    tmpdata["hinshiL_front_appear"]=(hinshiL_f)
                    tmpdata["kihonL_kakari_front_appear"]=(kihon_kakariL_f)
                    tmpdata["hinshiL_kakari_front_appear"]=(hinshi_kakariL_f)
                    tmpdata["kihonR_before_appear"]=(kihonR_b)
                    tmpdata["kihonR_front_appear"]=(kihonR_f)
                    tmpdata["hinshiR_before_appear"]=(hinshiR_b)
                    tmpdata["hinshiR_front_appear"]=(hinshiR_f)
                    tmpdata["kihonR_kakari_front_appear"]=(kihon_kakariR_f)
                    tmpdata["hinshiR_kakari_front_appear"]=(hinshi_kakariR_f)
                    """
                    #extend_feature_vector_rel(tmpdata,termL[0],kihonL_b,kihonL_f,hinshiL_b,hinshiL_f,kihon_kakariL_f,hinshi_kakariL_f,termR[0],kihonR_b,kihonR_f,hinshiR_b,hinshiR_f,kihon_kakariR_f,hinshi_kakariR_f)  # this part is work in progress
                    #extend_feature_vector_joshi_rel(tmpdata,s_posL,e_posL,knp_results[i-1])  # surrounding particles, left
                    #extend_feature_vector_joshi_rel(tmpdata,s_posR,e_posR,knp_results[i])  # right
                    # case type
                    # number of dependencies
                    # dependency type
                    feature_datas.append(tmpdata)
        if i > 0:
            done_mrph_num += len(knp_results[1][i - 1].mrph_list())
    return feature_datas
def __init__(self):
    self.knp = pyknp.KNP()
def knp_multithread():
    return pyknp.KNP(multithreading=True)
def knp():
    return pyknp.KNP()
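# A minimal usage sketch for the factories above (assumes a working local
# KNP/JUMAN installation): parse one sentence and print each morpheme's
# surface form and part of speech.
if __name__ == "__main__":
    parser = knp()
    result = parser.parse("太郎は花子に会いに行った")
    for mrph in result.mrph_list():
        print(mrph.midasi, mrph.hinsi)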
def __init__(self, data_dir, svg_dir_url, keyword='water', data_date='20200120',
             threshold='0.6', category_file=None):
    # If keyword is a super-category, put its sub-categories into keyword_list;
    # if it is a sub-category, use it as keyword_list directly
    if category_file is not None:
        with open(category_file) as f:
            categ_lines = f.readlines()
        categ_dic = {}
        for line in categ_lines:
            categ = line.strip().split()
            super_categ = categ[2]
            sub_categ = categ[3]
            categ_dic[super_categ] = categ_dic.get(super_categ, []) + [sub_categ]
        if keyword in categ_dic:
            self.keyword_list = categ_dic[keyword]
        else:
            self.keyword_list = [keyword]
    else:
        self.keyword_list = [keyword]
    self.data_dir = data_dir
    self.svg_dir_url = svg_dir_url
    self.threshold = str(float(threshold))
    self.data_date = data_date
    self.negative_pattern_list = ['不満だ/ふまんだ', '嫌だ/いやだ', '困る/こまる']
    self.knp = pyknp.KNP(
        jumancommand='/mnt/violet/share/tool/juman++v2/bin/jumanpp')
    self.rep2cluster = {}
    self.event_json = {}
    self.cluster_vec_dict = {}
    self.url_format_str = {}
    self.rep2events = {}
    for keyword in self.keyword_list:
        clustering_file = os.path.join(
            self.data_dir,
            'clustering/{}/{}_{}.json'.format(data_date, keyword, threshold))
        cluster_vec_file = os.path.join(
            self.data_dir,
            'clustering/{}/{}_{}.vec.pickle'.format(data_date, keyword, threshold))
        event_path = os.path.join(
            self.data_dir,
            'event_pairs/{}/{}.event_pairs.json'.format(data_date, keyword))
        self.url_format_str[keyword] = os.path.join(
            self.svg_dir_url, '{}/{}/{{}}.svg'.format(data_date, keyword))
        self.rep2cluster[keyword] = {}
        self.event_json[keyword] = []
        self.cluster_vec_dict[keyword] = {}
        self.rep2events[keyword] = defaultdict(list)
        with open(clustering_file, 'r') as f:
            for key, value in json.load(f).items():
                for v in value:
                    self.rep2cluster[keyword][v] = key
        with open(event_path, 'r') as f:
            self.event_json[keyword] = json.load(f)
        with open(cluster_vec_file, 'rb') as f:
            self.cluster_vec_dict[keyword] = pickle.load(f)
        for event in self.event_json[keyword]:
            self.rep2events[keyword][event["modifier_reps"]].append(
                (event, "modifier"))
            self.rep2events[keyword][event["head_reps"]].append(
                (event, "head"))
def __init__(self):
    self.original_knp = pyknp.KNP()
    self.knp_dict = {}
    if os.path.exists('knp.pickle'):
        self.load()
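# The load() call above implies a load/save pair this excerpt omits. A minimal
# sketch under that assumption (the pickle layout of knp_dict is guessed, not
# taken from the original):
import pickle

def load(self):
    # Restore memoized parse results from disk.
    with open('knp.pickle', 'rb') as f:
        self.knp_dict = pickle.load(f)

def save(self):
    # Persist memoized parse results to disk.
    with open('knp.pickle', 'wb') as f:
        pickle.dump(self.knp_dict, f)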
async def on_delete(self, req, resp):
    self.knp = pyknp.KNP()
def processEachTerm(term_num, term_dic, mecab_result_list, n,
                    titleabst_str=[], keywords=[], fulltext=""):
    # n: range of n-grams used as features
    """
    Feature extraction for each term.
    Feature data: [target term, position pos, frequency, is single character,
    appears in title, appears in abstract, appears in keywords,
    base forms of the n surrounding morphemes, POS of the n surrounding
    morphemes, base-form vectorization, POS vectorization]
    """
    knp = pyknp.KNP(command='knp', option='-tab -anaphora',
                    jumancommand='jumanpp', jumanpp=True)
    knp_results = [[], []]  # list for the title, list for the abstract
    knp_results[0].append(knp.parse(titleabst_str[0]))
    for sentence in re.split(r"\.|。", replaceDpoint(titleabst_str[1].replace(".", "."))):
        knp_results[1].append(
            knp.parse(sentence.replace("<dpoint>", ".") + "."))
    head_morph_ids = [0]  # morpheme id at the head of each abstract sentence
    now = 0
    for rslt in knp_results[1]:
        now = now + len(rslt.mrph_list())
        head_morph_ids.append(now)
    head_morph_ids.pop()
    outputdata = []  # list of feature dicts; returned
    freq_list = getFreqList(term_dic, fulltext)
    for term, pos_list in term_dic.items():
        in_title = "0.0"
        in_abst = "0.0"
        in_kw = "0.0"
        if term in titleabst_str[0]:
            in_title = "1.0"
        if term in titleabst_str[1]:
            in_abst = "1.0"
        if term in keywords:
            in_kw = "1.0"  # fixed: was the typo int_kw, which left in_kw never set
        #freq = str(calcFreqFeature(freq_list, len(pos_list), 10))  # either this or the raw frequency
        freq = str(calcTfidfFeature(term, len(pos_list), term_num))
        is_uni = "0.0"
        if len(term) == 1:
            is_uni = "1.0"
        digit_rate = str(digit_num_per_term(term))
        alpha_rate = str(alpha_num_per_term(term))
        tmpdata = {
            "term": term, "freq": freq, "is_uni": is_uni,
            "digit_rate": digit_rate, "alpha_rate": alpha_rate,
            "in_title": in_title, "in_abst": in_abst, "in_kw": in_kw
        }
        for pos in pos_list:
            e_pos = pos[1]
            sec_num = pos[0]
            # Fetch the KNP result that contains the term
            if sec_num == 0:
                knprslt = knp_results[0][0]
                s_id = pos[1]
            else:
                for h_i, head in enumerate(head_morph_ids):
                    if e_pos < head:
                        knprslt = knp_results[1][h_i - 1]
                        h = head_morph_ids[h_i - 1]
                        break
                s_id = e_pos - h  # morpheme id within that sentence
            e_id = s_id
            tmp_term_len = len(mecab_result_list[sec_num][e_pos][0])
            while tmp_term_len != len(term):  # find the term's final position
                tmp_term_len += len(mecab_result_list[sec_num][e_pos + 1][0])
                e_pos += 1
                e_id += 1
            #kihon_b,kihon_f,hinshi_b,hinshi_f=getBehindFrontNMorphenes(mecab_result_list[sec_num],pos[1],e_pos,n)
            kihon_b, kihon_f, hinshi_b, hinshi_f = getBehindFrontNMorphenesByKNP(
                knprslt, s_id, e_id, n)
            kihon_kakari_f, hinshi_kakari_f = getBehindFrontNMorphenesKakariByKNP(
                knprslt, s_id, e_id, n)
            tmpdata["pos"] = pos
            tmpdata["kihon_before_appear"] = kihon_b
            tmpdata["kihon_front_appear"] = kihon_f
            tmpdata["hinshi_before_appear"] = hinshi_b
            tmpdata["hinshi_front_appear"] = hinshi_f
            tmpdata["kihon_kakari_front_appear"] = kihon_kakari_f
            tmpdata["hinshi_kakari_front_appear"] = hinshi_kakari_f
            extend_feature_vector(tmpdata, term, kihon_b, kihon_f, hinshi_b,
                                  hinshi_f, kihon_kakari_f, hinshi_kakari_f)
            outputdata.append(tmpdata)
            # start a fresh dict for the next occurrence of this term
            tmpdata = {
                "term": term, "freq": freq, "is_uni": is_uni,
                "digit_rate": digit_rate, "alpha_rate": alpha_rate,
                "in_title": in_title, "in_abst": in_abst, "in_kw": in_kw
            }
    return outputdata