def _apply_knp(self, sent: str) -> str:
    self.logger.info(f'parse sentence: {sent}')
    knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_dpnd_option)
    knp_result = knp.parse(sent)

    if self.remote_knp:
        _, jumanpp_conll_out = self._apply_jumanpp(sent)
        clientsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.logger.info(f'connect to {self.knp_host}:{self.knp_port}')
        clientsock.connect((self.knp_host, self.knp_port))
        clientsock.sendall(jumanpp_conll_out.encode('utf-8'))

        buf = []
        while True:
            data = clientsock.recv(8192)
            data_utf8 = data.decode('utf-8')
            buf.append(data_utf8)
            if data_utf8.endswith('EOS\n'):
                break
        clientsock.close()
        conllu_out = ''.join(buf)
        self.logger.info(f'received {len(conllu_out)} chars from remote KNP')

        # overwrite the dependencies in the KNP result with the CoNLL-U output of the remote KNP
        head_ids, dpnd_types = self._read_conllu_from_buf(conllu_out)
        self._modify_knp(knp_result, head_ids, dpnd_types)

    # add predicate-argument structures by KNP
    knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_case_option)
    knp_result_new = knp.parse_juman_result(knp_result.spec())
    return knp_result_new.spec()
def generate_knowledge(sentence):
    # parse with KNP
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(sentence.replace(" ", ""))
    bnst_list = result.bnst_list()
    # dictionary of bunsetsu keyed by bnst_id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    infos = []
    info = dict((x, None) for x in info_elements)
    for bnst in bnst_list:
        place = get_gimonshi(bnst, bnst_dic)
        if place is None:
            continue
        if info[place] is None:
            info[place] = select_normalization_representative_notation(bnst.fstring)
        else:
            # the slot is already filled: flush the current info and start a new one
            infos.append(info)
            info = dict((x, None) for x in info_elements)
            info[place] = select_normalization_representative_notation(bnst.fstring)
    # flush the final info as well if it holds any value
    if any(v is not None for v in info.values()):
        infos.append(info)
    return infos
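# Hedged usage sketch for generate_knowledge() above: it depends on
# info_elements, get_gimonshi(), and select_normalization_representative_notation()
# defined elsewhere in the same module, so this only illustrates the call shape.
if __name__ == '__main__':
    for info in generate_knowledge('京都駅から東京駅まで新幹線で行きます。'):
        print(info)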
def __init__(
    self,
    dataset: PASDataset,
    logger: Logger,
    use_knp_overt: bool = True,
) -> None:
    self.examples: List[PasExample] = dataset.examples
    self.cases: List[str] = dataset.target_cases
    self.bridging: bool = dataset.bridging
    self.coreference: bool = dataset.coreference
    self.relations: List[str] = dataset.target_cases + (['ノ'] * self.bridging) + (['='] * self.coreference)
    self.exophors: List[str] = dataset.target_exophors
    self.index_to_special: Dict[int, str] = {idx: token for token, idx in dataset.special_to_index.items()}
    self.documents: List[Document] = dataset.documents
    self.logger = logger
    self.use_knp_overt = use_knp_overt
    self.kc: bool = dataset.kc
    self.reader = dataset.reader

    cfg = configparser.ConfigParser()
    cfg.read(Path(__file__).parent.parent / 'analyzer' / 'config.ini')
    if 'default' not in cfg:
        logger.warning('Analyzer config not found. Falling back to default values.')
        cfg['default'] = {}
    section = cfg['default']
    knp_command = section.get('knp_command', shutil.which('knp'))
    jumanpp_command = section.get('juman_command', shutil.which('jumanpp'))
    self.knp = KNP(command=knp_command, option='-tab -case2', jumancommand=jumanpp_command)
def __init__(self):
    self._knp = KNP()
    self._valid_nominative_strings = self._load_file('valid_nominative_strings.txt')
    self._valid_nominative_semantic_markers = self._load_file('valid_nominative_semantic_markers.txt')
    self._volition_modalities = self._load_file('volition_modalities.txt')
    self._volition_voices = self._load_file('volition_voices.txt')
    self._non_volition_voices = self._load_file('non_volition_voices.txt')
    self._volition_adverb_repnames = self._load_file('volition_adverb_repnames.txt')
    self._non_volition_adverb_repnames = self._load_file('non_volition_adverb_repnames.txt')
    self._valid_adjective_predicate_suffix_repnames = self._load_file('valid_adjective_predicate_suffix_repnames.txt')
    self._non_volition_verbal_suffix_semantic_labels = self._load_file('non_volition_verbal_suffix_semantic_labels.txt')
    self._non_volition_verbal_suffix_repnames = self._load_file('non_volition_verbal_suffix_repnames.txt')
    self._non_volition_types = self._load_file('non_volition_types.txt')
    self._non_volition_head_repnames = self._load_file('non_volition_head_repnames.txt')
    self._non_volition_semantic_labels = self._load_file('non_volition_semantic_labels.txt')
def select_dependency_structure(line):
    """Extract the dependency structure as (dependent, head) pairs."""
    knp = KNP(option='-tab -anaphora')
    # parse the sentence
    result = knp.parse(line)
    # list of bunsetsu (phrases)
    bnst_list = result.bnst_list()
    # index the bunsetsu list by id
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    tuples = []
    for bnst in bnst_list:
        if bnst.parent_id != -1:
            # (from, to)
            print("bnst_id:{} parent_id:{}\n".format(bnst.bnst_id, bnst.parent_id))
            tuples.append((
                select_normalization_representative_notation(bnst.fstring),
                select_normalization_representative_notation(bnst_dic[bnst.parent_id].fstring),
            ))
    return tuples
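# A minimal call sketch for select_dependency_structure() above (assumes KNP is
# installed and select_normalization_representative_notation() is in scope):
for child, parent in select_dependency_structure('望遠鏡で泳いでいる女の子を見た。'):
    print(child, '->', parent)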
def main():
    knp = KNP(jumanpp=True, option='-tab -assignf')
    parser = argparse.ArgumentParser()
    parser.add_argument("--npyfile", "-m")
    parser.add_argument("--vocabfile", "-v")
    parser.add_argument("--topk", "-k", type=int, default=5)
    parser.add_argument("--query", "-q", type=str, default='')
    parser.add_argument("--cnpyfile", "-c", type=str, default='')
    parser.add_argument("--cvocabfile", "-u", type=str, default='')
    args = parser.parse_args()

    w2vec = load_model(args.npyfile, args.vocabfile)
    c2vec = {}
    if args.cnpyfile and args.cvocabfile:
        c2vec = load_model(args.cnpyfile, args.cvocabfile)

    if args.query:
        parse_and_print(args.query, knp, w2vec, args.topk, c2vec)
        return
    while True:
        q = input()
        parse_and_print(q, knp, w2vec, args.topk, c2vec)
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]    # e.g. '../dataset/mountains_ja.knp.gz'
    vocabfile = sys.argv[2]  # e.g. '../dataset/mountains_ja.vocab.gz'
    assert 'gz' in knpfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(vocabfile, mode='wt', encoding='utf8') as ofp:
        write_vocab(knp, ifp, ofp)
def init():
    r = request
    print(r.data)
    global knp
    knp = KNP()
    global queue
    queue = deque()
    result = {"text": "init done"}
    return jsonify(ResultSet=result)
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]     # e.g. '../dataset/wikipedia.knp.gz'
    wakatifile = sys.argv[2]  # e.g. '../dataset/wikipedia.deps'
    assert 'gz' in knpfile
    assert 'gz' in wakatifile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(wakatifile, mode='wt', encoding='utf8') as ofp:
        write_wakati(knp, ifp, ofp, WORDFORM.LONGEST)
def __init__(
    self,
    knp: Optional[KNP] = None,
    jumanpp: bool = True,
    fallback_juman: bool = True,
):
    self.knp = KNP(jumanpp=jumanpp) if knp is None else knp
    self.juman = self.knp.juman
    self.knp.parse("。")  # warm-up parse to force creation of self.knp.socket or its subprocess
    self.fallback_juman = fallback_juman
from typing import Tuple


def tag(text: str) -> Tuple[list, list]:
    '''
    Returns:
        tag_list and tag_ids: [(child basic-phrase ID, parent basic-phrase ID), ...]
    '''
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    tag_ids = list()
    for tag in tag_list:  # visit each basic phrase (tag)
        if tag.parent_id != -1:
            tag_ids.append((tag.tag_id, tag.parent_id))
    return tag_list, tag_ids
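# A minimal usage sketch for tag() above (assumes KNP/Juman++ are installed and
# on PATH): print the surface form of each dependent/head basic-phrase pair.
tags, ids = tag('望遠鏡で泳いでいる女の子を見た。')
for child_id, parent_id in ids:
    child = ''.join(m.midasi for m in tags[child_id].mrph_list())
    parent = ''.join(m.midasi for m in tags[parent_id].mrph_list())
    print(child, '->', parent)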
def total_chunk2(text):
    from pyknp import KNP
    knp = KNP()
    result = knp.parse(text)
    num = 0
    for bnst in result.bnst_list():
        surface = "".join(mrph.midasi for mrph in bnst.mrph_list())
        if surface != "None":
            print(surface)
            num += 1
    return num
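# Example call for total_chunk2() above: prints each chunk's surface form and
# returns the chunk count for a single sentence.
print(total_chunk2('望遠鏡で泳いでいる女の子を見た。'))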
import re
from typing import Tuple


def tag(text: str) -> Tuple[list, list]:
    '''
    Returns:
        tag_list and tag_ids: [(child basic-phrase ID, parent basic-phrase ID), ...]
    '''
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    tag_ids = list()
    for tag in tag_list:  # visit each basic phrase (tag)
        if re.search('<格関係', tag.fstring):
            # a '格関係N' feature marks a case relation to the tag whose ID is N
            # (allow multi-digit IDs)
            for i in re.findall(r'格関係(\d+)', tag.fstring):
                tag_ids.append((int(i), tag.tag_id))
    return tag_list, tag_ids
def __init__(self):
    '''initialize

    Examples
    --------
    >>> nlp = JNLP()
    '''
    self.juman = Juman()
    self.KNP = KNP(option='-tab -anaphora')
def extract_poems(lines: List[str], jobs: int) -> List[Tuple]:
    knp = KNP(jumanpp=True)
    chunk_size = len(lines) // jobs + 1
    arguments = [(lines[i:i + chunk_size], knp) for i in range(0, len(lines), chunk_size)]
    with mp.Pool(jobs) as p:
        checked_chunks = p.starmap(_extract_poems, arguments)
    poems = []
    for chunk in checked_chunks:
        poems.extend(chunk)
    return poems
def get_u_gimonshi(sentence):
    line = sentence.replace(" ", "")
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(line)
    bnst_list = result.bnst_list()
    bnst_dic = dict((x.bnst_id, x) for x in bnst_list)
    u_gimonshi = ""
    for bnst in bnst_list:
        place = question(bnst, bnst_dic)
        if place is not None:
            u_gimonshi = place
    return u_gimonshi
def load_knp_from_stream(f, juman_format=JUMAN_FORMAT.DEFAULT):
    """Interpret a KNP-format analysis result file and yield bunsetsu-list objects.

    Args:
        f (file): A file object containing KNP-format analysis results.
        juman_format (JUMAN_FORMAT): The Juman lattice output format.

    Yields:
        BList: A bunsetsu-list object for each sentence.
    """
    knp = KNP()
    buf = ""
    for line in f:
        buf += line
        if line.startswith("EOS"):
            yield knp.result(buf, juman_format=juman_format)
            buf = ""
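# A usage sketch for load_knp_from_stream() above (the file path is
# hypothetical): stream sentences out of a stored KNP result file and print
# each sentence's surface form.
with open('result.knp', encoding='utf-8') as f:
    for blist in load_knp_from_stream(f):
        print(''.join(m.midasi for m in blist.mrph_list()))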
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]  # e.g. './wiki_00.knp.gz'
    outdir = sys.argv[2]
    assert '.knp.gz' in knpfile
    bn = os.path.basename(knpfile)
    head = os.path.join(outdir, bn.split('.')[0])
    tffile_ab = head + '.ab.tf.gz'      # './wiki_00.ab.tf.gz'
    dffile_ab = head + '.ab.df.gz'      # './wiki_00.ab.df.gz'
    tffile_anob = head + '.anob.tf.gz'  # './wiki_00.anob.tf.gz'
    dffile_anob = head + '.anob.df.gz'  # './wiki_00.anob.df.gz'
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(tffile_ab, mode='wt', encoding='utf8') as ofp_tf_ab, \
            gzip.open(dffile_ab, mode='wt', encoding='utf8') as ofp_df_ab, \
            gzip.open(tffile_anob, mode='wt', encoding='utf8') as ofp_tf_anob, \
            gzip.open(dffile_anob, mode='wt', encoding='utf8') as ofp_df_anob:
        parse_and_write(knp, ifp, ofp_tf_anob, ofp_tf_ab, ofp_df_anob, ofp_df_ab)
def total_chunk(text):
    from pyknp import KNP
    knp = KNP()
    num = 0
    for stc in text.split("。"):
        if stc == "":
            continue
        try:
            result = knp.parse(stc)
            for bnst in result.bnst_list():
                if "".join(mrph.midasi for mrph in bnst.mrph_list()) != "None":
                    num += 1
        except Exception:
            pass
    return num
def get_context_words(self, sentence_size_limit=100):
    knp = KNP()
    knp_extractor = KNP_extractor(self.config.knp_index_db, self.config.knp_parent_dir, self.config.knp_sub_index_length)
    context_words = Counter()
    for index, sent_tuple in enumerate(self.sents[:sentence_size_limit]):
        sid = sent_tuple.sid.split('%')[0]
        sup_knp = knp_extractor.get_knp(sid)
        if not sup_knp:
            sys.stderr.write("failed to retrieve the KNP result of %s.\n" % sid)
            continue
        try:
            result = knp.result(sup_knp.decode('utf-8'))
            context_words.update(self._get_sentence_args(result))
        except Exception:
            sys.stderr.write("failed to parse the KNP result of %s.\n" % sid)
    return dict(context_words)
def read_knp_result_file(filename: str) -> List[BList]:
    """Read a KNP result file.

    Args:
        filename: A filename.

    Returns:
        A list of :class:`pyknp.knp.blist.BList` objects.
    """
    knp = KNP()
    blists = []
    with open(filename, "rt", encoding="utf-8", errors="replace") as f:
        chunk = ""
        for line in f:
            chunk += line
            if line.strip() == "EOS":
                blists.append(knp.result(chunk))
                chunk = ""
    return blists
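# Example call for read_knp_result_file() above (the file name is
# hypothetical): load every sentence from a stored KNP result file and report
# how many were read.
blists = read_knp_result_file('corpus.knp')
print(len(blists), 'sentences')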
def __init__(
    self,
    word2vec_model: Word2VecModel,
    juman_command: str = 'jumanpp',
    specific_parts: Optional[List[str]] = None,
) -> None:
    if specific_parts is None:
        specific_parts = ['普通名詞']
    self.juman: Union[Juman, Jumanpp]
    if juman_command == 'juman':
        self.juman = Juman()
    elif juman_command == 'jumanpp':
        self.juman = Jumanpp()
    else:
        raise ValueError(f'unsupported juman_command: {juman_command}')
    self.knp: KNP = KNP(jumancommand=juman_command)
    self.specific_parts: List[str] = specific_parts
    self.word2vec: Word2VecModel = word2vec_model
def main():
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]    # e.g. '../dataset/wikipedia.knp.gz'
    vocabfile = sys.argv[2]  # e.g. '../dataset/wikipedia.vocab'
    depsfile = sys.argv[3]   # e.g. '../dataset/wikipedia.deps'

    # frequency cutoff applied to the whole vocabulary
    vocab_thre = 100
    assert 'gz' in vocabfile
    with gzip.open(vocabfile, mode='rt', encoding='utf8') as ifp:
        vocab = read_vocab(ifp, vocab_thre)

    # extract dependency pairs from a KNP-parsed file.
    # CoNLL: tokens = [(id, form, head, deprel)]
    assert 'gz' in knpfile
    assert 'gz' in depsfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp, \
            gzip.open(depsfile, mode='wt', encoding='utf8') as ofp:
        write_deps(knp, vocab, ifp, ofp)
def generate_utterance(u_sen, all_infos):
    # determine which slot the user is asking about
    question = get_u_gimonshi(u_sen)

    # locate which info the question refers to
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(u_sen.replace(" ", ""))
    search_words = [
        create_infos.select_normalization_representative_notation(bnst.fstring)
        for bnst in result.bnst_list()
    ]
    search_point = -1
    for search_word in search_words:
        for i, info in enumerate(all_infos):
            if search_word in info.values():
                search_point = i

    if search_point == -1:
        return "No information"

    def slot(i):
        # bounds-checked access to the question slot of all_infos[i]
        if 0 <= i < len(all_infos):
            return all_infos[i][question]
        return None

    # prefer the matched info; if its slot is empty, fall back to the previous
    # and then the next neighbouring info
    for candidate in (slot(search_point), slot(search_point - 1), slot(search_point + 1)):
        if candidate is not None:
            return candidate + "です"
    return "No information"
def __init__(self, text: str, delimiter: str = '\n'):
    self.text = text
    self.delimiter = delimiter
    self.sentences = util.split_text(self.text, delimiter)
    self.n_sentences = len(self.sentences)
    self.knp = KNP(option=DefaultOptions.KNP, jumanpp=False)
    self.trees = self._trees()
    self.juman = Juman(jumanpp=False)
    self.rs_pos = self.calc_rs_pos()
    self.n_mrphs = self.calc_n_mrphs()
    self.n_chunks = self.calc_n_chunks()
    self.n_types = self.calc_n_types()
    self.mean_n_mrphs = None if self.n_sentences == 0 else self.n_mrphs / self.n_sentences
    self.rs_modality = self.calc_rs_modality()
    self.r_conditional = None if self.n_sentences == 0 else self.calc_n_conditionals() / self.n_sentences
    self.mean_tree_depths = self.calc_mean_tree_depths()
def evg():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", "-o", default="", help="path to output")
    args = parser.parse_args()

    basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    knp = KNP()
    results = []
    chunk = ""
    for line in codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin)):
        chunk += line
        if line.strip() == "EOS":
            results.append(knp.result(chunk))
            chunk = ""

    evg_ = EventGraph.build(results)
    if args.output:
        evg_.save(args.output)
    else:
        print(json.dumps(evg_.to_dict(), indent=4, ensure_ascii=False))
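# Hypothetical invocation of evg() above (file names are assumptions), in the
# same pipeline style as the other scripts in this collection:
#   cat parsed.knp | python evg.py --output events.json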
def test():
    # ex.) echo "私は自然言語処理の研究をする" | juman | knp -tab -dpnd | python DependencyParser.py
    knp = KNP()
    data = ""
    for line in iter(sys.stdin.readline, ""):
        data += line
        if line.strip() == "EOS":
            result = knp.result(data)
            DB = parseDependency(result.bnst_list(), head=False)
            DBhead = parseDependency(result.bnst_list(), head=True)
            print("parent-child")
            # for bnstrep in DB:
            #     print(bnstrep)
            for bnstrep in DBhead:
                print(bnstrep)
            data = ""
from pyknp import KNP

knp = KNP(jumanpp=True)
result = knp.parse("望遠鏡で泳いでいる女の子を見た。")
for bnst in result.bnst_list():
    parent = bnst.parent
    if parent is not None:
        # a predicate bunsetsu that depends non-adjacently on a noun bunsetsu
        if "<用言:" in bnst.fstring and "<体言>" in parent.fstring and parent.bnst_id - bnst.bnst_id - 1 != 0:
            child_rep = " ".join(mrph.repname for mrph in bnst.mrph_list())
            parent_rep = " ".join(mrph.repname for mrph in parent.mrph_list())
            print(child_rep, "->", parent_rep, parent.bnst_id - bnst.bnst_id - 1)
import json
import os
import pprint

import ptvsd
import regex
from pyknp import KNP

ptvsd.enable_attach(address=("0.0.0.0", 3000))
ptvsd.wait_for_attach()
ptvsd.break_into_debugger()

knp = KNP()  # default is JUMAN++; to use JUMAN, pass KNP(jumanpp=False)

file_name = "feedbacksheet.json"
if os.path.exists(file_name):
    os.remove(file_name)

dependency_dic = {}
for index in range(2, 22):
    index_json = {}
    feature_json = {}
    with open("data/{}.json".format(index), "r") as f:
        json_data = json.load(f)
    history = json_data.pop("history").strip("[]")
    history_list = regex.findall("{(?>[^{}]+|(?R))*}", history)
    comment_2d_list = []
    for history_str in history_list:
def init_knp():
    return KNP(option='-ne-crf -tab')