Exemple #1
0
    def _apply_knp(self, sent: str) -> str:
        """Parse *sent* with KNP and return the analysis in spec (tab) format.

        Runs a dependency-parsing pass first (optionally replacing the
        dependencies with those from a remote KNP server), then re-runs KNP
        to add predicate-argument structures (case analysis).

        Args:
            sent: A raw sentence to parse.

        Returns:
            The final KNP analysis as a spec string.
        """
        self.logger.info(f'parse sentence: {sent}')
        # dependency-parsing pass
        knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_dpnd_option)
        knp_result = knp.parse(sent)

        if self.remote_knp is True:
            _, jumanpp_conll_out = self._apply_jumanpp(sent)
            # `with` guarantees the socket is closed even if recv()/decode
            # fails (the original leaked the socket on error).
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as clientsock:
                self.logger.info(f'connect to {self.knp_host}:{self.knp_port}')
                clientsock.connect((self.knp_host, self.knp_port))
                clientsock.sendall(jumanpp_conll_out.encode('utf-8'))

                # Accumulate raw bytes and decode once at the end: decoding
                # each recv() chunk separately raised UnicodeDecodeError
                # whenever a multibyte UTF-8 character straddled a chunk
                # boundary, and the 'EOS\n' terminator itself could be split
                # across two chunks, hanging the loop.
                buf = bytearray()
                while True:
                    data = clientsock.recv(8192)
                    if not data:
                        # server closed the connection before sending 'EOS\n'
                        break
                    buf += data
                    if buf.endswith(b'EOS\n'):
                        break
            conllu_out = buf.decode('utf-8')
            self.logger.info(f'received {len(conllu_out)} chars from remote KNP')

            # modify KNP result by conllu result of remote KNP
            head_ids, dpnd_types = self._read_conllu_from_buf(conllu_out)
            self._modify_knp(knp_result, head_ids, dpnd_types)

        # add predicate-argument structures by KNP
        knp = KNP(command=self.knp, jumancommand=self.juman, option=self.knp_case_option)
        knp_result_new = knp.parse_juman_result(knp_result.spec())
        return knp_result_new.spec()
def generate_knowledge(sentence):
    """Parse *sentence* with KNP and collect per-slot info dictionaries.

    Each bunsetsu is mapped to a question-word slot (via ``get_gimonshi``);
    when a slot that is already filled is seen again, the current record is
    appended to the result and a fresh one is started.

    Args:
        sentence: The input sentence (spaces are stripped, since KNP expects
            unsegmented Japanese text).

    Returns:
        list[dict]: Completed info records (keyed by ``info_elements``).
    """
    # parse with KNP
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(sentence.replace(" ", ""))
    bnst_list = result.bnst_list()

    # bunsetsu-id -> bunsetsu lookup
    bnst_dic = {x.bnst_id: x for x in bnst_list}

    infos = []
    info = dict.fromkeys(info_elements)
    for bnst in bnst_list:
        place = get_gimonshi(bnst, bnst_dic)

        if place is None:
            continue

        if info[place] is None:
            info[place] = select_normalization_representative_notation(
                bnst.fstring)
        else:
            # slot already occupied: finalize the record and start a new one
            infos.append(info)
            info = dict.fromkeys(info_elements)
            info[place] = select_normalization_representative_notation(
                bnst.fstring)

    # NOTE(review): the last `info` is appended only when a later slot
    # conflict occurs — confirm callers rely on this (possible lost record).
    return infos
Exemple #3
0
 def __init__(
     self,
     dataset: PASDataset,
     logger: Logger,
     use_knp_overt: bool = True,
 ) -> None:
     """Cache dataset fields and construct a KNP instance for case analysis.

     Args:
         dataset: PAS dataset providing examples, target cases and documents.
         logger: Logger used for warnings.
         use_knp_overt: Whether to use KNP output for overt arguments.
     """
     self.examples: List[PasExample] = dataset.examples
     self.cases: List[str] = dataset.target_cases
     self.bridging: bool = dataset.bridging
     self.coreference: bool = dataset.coreference
     # relation labels = target cases, plus 'ノ' when bridging is enabled and
     # '=' when coreference is enabled (bool multiplies the one-element list)
     self.relations: List[str] = dataset.target_cases + (
         ['ノ'] * self.bridging) + (['='] * self.coreference)
     self.exophors: List[str] = dataset.target_exophors
     # inverse mapping of special_to_index: token index -> token string
     self.index_to_special: Dict[int, str] = {
         idx: token
         for token, idx in dataset.special_to_index.items()
     }
     self.documents: List[Document] = dataset.documents
     self.logger = logger
     self.use_knp_overt = use_knp_overt
     self.kc: bool = dataset.kc
     self.reader = dataset.reader
     # locate knp/jumanpp commands from the analyzer config; fall back to
     # whatever executables are found on PATH
     cfg = configparser.ConfigParser()
     cfg.read(Path(__file__).parent.parent / 'analyzer' / 'config.ini')
     if 'default' not in cfg:
         logger.warning(
             'Analyzer config not found. Instead, use default values.')
         cfg['default'] = {}
     section = cfg['default']
     knp_command = section.get('knp_command', shutil.which('knp'))
     jumanpp_command = section.get('juman_command', shutil.which('jumanpp'))
     self.knp = KNP(command=knp_command,
                    option='-tab -case2',
                    jumancommand=jumanpp_command)
Exemple #4
0
    def __init__(self):
        """Create a KNP parser and load every word/feature list from its file.

        Each resource file ``<name>.txt`` is loaded via ``self._load_file``
        and stored on the attribute ``self._<name>``.
        """
        self._knp = KNP()

        # one resource file per attribute; names double as file basenames
        resource_names = (
            'valid_nominative_strings',
            'valid_nominative_semantic_markers',
            'volition_modalities',
            'volition_voices',
            'non_volition_voices',
            'volition_adverb_repnames',
            'non_volition_adverb_repnames',
            'valid_adjective_predicate_suffix_repnames',
            'non_volition_verbal_suffix_semantic_labels',
            'non_volition_verbal_suffix_repnames',
            'non_volition_types',
            'non_volition_head_repnames',
            'non_volition_semantic_labels',
        )
        for name in resource_names:
            setattr(self, f'_{name}', self._load_file(f'{name}.txt'))
def select_dependency_structure(line):
    """Extract the dependency structure of *line*.

    Returns:
        list[tuple]: (dependent, head) pairs of normalized representative
        notations, one per non-root bunsetsu.
    """
    # parse with KNP
    knp = KNP(option='-tab -anaphora')
    parsed = knp.parse(line)

    bunsetsu_list = parsed.bnst_list()

    # bunsetsu-id -> bunsetsu lookup table
    by_id = {b.bnst_id: b for b in bunsetsu_list}

    pairs = []
    for bunsetsu in bunsetsu_list:
        # -1 marks the root, which depends on nothing
        if bunsetsu.parent_id == -1:
            continue
        print("bnst_id:{} parent_id:{}\n".format(bunsetsu.bnst_id,
                                                 bunsetsu.parent_id))
        child = select_normalization_representative_notation(bunsetsu.fstring)
        head = select_normalization_representative_notation(
            by_id[bunsetsu.parent_id].fstring)
        pairs.append((child, head))

    return pairs
Exemple #6
0
def main():
    """Answer similarity queries: parse each query with KNP and print results.

    With ``--query`` a single query is answered; otherwise queries are read
    from stdin in a loop.
    """
    knp = KNP(jumanpp=True, option='-tab -assignf')

    parser = argparse.ArgumentParser()
    parser.add_argument("--npyfile", "-m")
    parser.add_argument("--vocabfile", "-v")
    parser.add_argument("--topk", "-k", type=int, default=5)
    parser.add_argument("--query", "-q", type=str, default='')
    parser.add_argument("--cnpyfile", "-c", type=str, default='')
    parser.add_argument("--cvocabfile", "-u", type=str, default='')
    args = parser.parse_args()

    w2vec = load_model(args.npyfile, args.vocabfile)
    # context vectors are optional; load them only when both files are given
    if args.cnpyfile and args.cvocabfile:
        c2vec = load_model(args.cnpyfile, args.cvocabfile)
    else:
        c2vec = {}

    if args.query:
        parse_and_print(args.query, knp, w2vec, args.topk, c2vec)
        return
    while True:
        parse_and_print(input(), knp, w2vec, args.topk, c2vec)
Exemple #7
0
def main():
    """Build a vocabulary file from a gzipped KNP parse dump (argv[1] -> argv[2])."""
    knp = KNP(jumanpp=True, option='-tab')
    knp_path = sys.argv[1]  # '../dataset/mountains_ja.knp.gz'
    vocab_path = sys.argv[2]  # '../dataset/mountains_ja.vocab.gz'
    assert 'gz' in knp_path
    with gzip.open(knp_path, mode='rt', encoding='utf8', errors='ignore') as src, \
         gzip.open(vocab_path, mode='wt', encoding='utf8') as dst:
        write_vocab(knp, src, dst)
def init():
    """(HTTP handler) Initialize the module-global KNP parser and work queue.

    Returns a JSON response confirming initialization.
    """
    r = request
    print(r.data)  # log the raw request body
    global knp 
    knp = KNP()
    global queue
    queue = deque()

    result = {"text":"init done"}
    return jsonify(ResultSet=result)
Exemple #9
0
def main():
    """Write segmented (wakati) output from a gzipped KNP dump (argv[1] -> argv[2])."""
    knp = KNP(jumanpp=True, option='-tab')
    knp_path = sys.argv[1]  # '../dataset/wikipedia.knp.gz'
    wakati_path = sys.argv[2]  # '../dataset/wikipedia.deps'

    assert 'gz' in knp_path
    assert 'gz' in wakati_path
    with gzip.open(knp_path, mode='rt', encoding='utf8', errors='ignore') as src, \
         gzip.open(wakati_path, mode='wt', encoding='utf8') as dst:
        write_wakati(knp, src, dst, WORDFORM.LONGEST)
Exemple #10
0
 def __init__(
     self,
     knp: Optional[KNP] = None,
     jumanpp: bool = True,
     fallback_juman: bool = True,
 ):
     """Wrap a KNP instance (creating one if not supplied) and warm it up.

     Args:
         knp: An existing KNP instance to reuse; a new one is created if None.
         jumanpp: Use Juman++ when creating the KNP instance.
         fallback_juman: Whether to fall back to Juman on failure.
     """
     self.knp = KNP(jumanpp=jumanpp) if knp is None else knp
     self.juman = self.knp.juman
     self.knp.parse("。")  # warm-up parse so self.knp creates its socket/subprocess now
     self.fallback_juman = fallback_juman
Exemple #11
0
def tag(text: str) -> (list, list):
    """Parse *text* with KNP and collect dependency edges between basic phrases.

    Returns:
        tag_list: All basic-phrase (tag) objects from the parse.
        tag_ids: [(child tag ID, parent tag ID), ...] for every non-root tag.
    """
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    # parent_id == -1 marks the root, which has no outgoing edge
    tag_ids = [(t.tag_id, t.parent_id) for t in tag_list if t.parent_id != -1]
    return tag_list, tag_ids
Exemple #12
0
def total_chunk2(text):
    """Count (and print) the bunsetsu chunks of *text* whose surface form is not the string "None"."""
    from pyknp import KNP
    parser = KNP()
    parsed = parser.parse(text)
    count = 0
    for bunsetsu in parsed.bnst_list():
        surface = "".join(m.midasi for m in bunsetsu.mrph_list())
        if surface != "None":
            print(surface)
            count += 1

    return count
Exemple #13
0
def tag(text: str) -> (list, list):
    """Parse *text* with KNP and extract case-relation (格関係) edges.

    Returns:
        tag_list: All basic-phrase (tag) objects from the parse.
        tag_ids: [(child tag ID, parent tag ID), ...] read from each tag's
            格関係 feature string.
    """
    knp = KNP()
    tag_list = knp.parse(text).tag_list()
    tag_ids = []
    for t in tag_list:
        # only tags carrying a case-relation feature contribute edges
        if not re.search('<格関係', t.fstring):
            continue
        for feature in re.findall(r'格関係\d', t.fstring):
            child_id = int(re.sub(r'格関係', '', feature))
            tag_ids.append((child_id, t.tag_id))
    return tag_list, tag_ids
Exemple #14
0
    def __init__(self):
        '''Initialize Juman (morphological analysis) and KNP (parsing with anaphora resolution).

        Examples
        --------
        >>> nlp = JNLP()

        '''

        self.juman = Juman()
        self.KNP = KNP(option='-tab -anaphora')
def extract_poems(lines: List[str], jobs: int) -> List[Tuple]:
    """Split *lines* into *jobs* chunks and extract poems from each in parallel.

    Returns:
        The concatenated poem tuples from every worker.
    """
    knp = KNP(jumanpp=True)
    chunk_size = len(lines) // jobs + 1
    # NOTE(review): the KNP instance is shipped to worker processes via
    # starmap — confirm it survives pickling
    arguments = [(lines[start:start + chunk_size], knp)
                 for start in range(0, len(lines), chunk_size)]
    with mp.Pool(jobs) as pool:
        checked_chunks = pool.starmap(_extract_poems, arguments)

    collected = []
    for chunk in checked_chunks:
        collected.extend(chunk)
    return collected
Exemple #16
0
def get_u_gimonshi(sentence):
    """Return the question-word slot asked about in *sentence* ('' if none).

    Spaces are stripped before parsing because KNP expects unsegmented
    Japanese text. When several bunsetsu map to a slot, the LAST one wins.
    """
    line = sentence.replace(" ", "")
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(line)
    bnst_list = result.bnst_list()
    bnst_dic = {x.bnst_id: x for x in bnst_list}

    u_gimonshi = ""
    for bnst in bnst_list:
        place = question(bnst, bnst_dic)
        if place is not None:
            # keep overwriting so the last detected slot is returned
            u_gimonshi = place

    return u_gimonshi
Exemple #17
0
def load_knp_from_stream(f, juman_format=JUMAN_FORMAT.DEFAULT):
    """Interpret a KNP-format analysis stream and yield bunsetsu-list objects.

    Args:
        f (file): File object containing KNP-format analysis results.
        juman_format (JUMAN_FORMAT): Juman lattice output format.

    Yields:
        BList: One bunsetsu-list object per sentence (delimited by EOS lines).
    """
    knp = KNP()
    pending = ""
    for line in f:
        pending += line
        if not line.startswith("EOS"):
            continue
        yield knp.result(pending, juman_format=juman_format)
        pending = ""
Exemple #18
0
def main():
    """Compute tf/df tables ('ab' and 'anob' variants) from a gzipped KNP dump."""
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1] # './wiki_00.knp.gz'
    outdir = sys.argv[2]
    assert '.knp.gz' in knpfile
    # derive the four output paths from the input file's basename
    bn = os.path.basename(knpfile)
    head = os.path.join(outdir, bn.split('.')[0])
    tffile_ab = head + '.ab.tf.gz' # './wiki_00.ab.tf.gz'
    dffile_ab = head + '.ab.df.gz' # './wiki_00.ab.df.gz'
    tffile_anob = head + '.anob.tf.gz'# './wiki_00.anob.tf.gz'
    dffile_anob = head + '.anob.df.gz'# './wiki_00.anob.df.gz'

    # NOTE: parse_and_write takes the output handles in (tf_anob, tf_ab,
    # df_anob, df_ab) order — different from the order they are opened in
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp,\
        gzip.open(tffile_ab, mode='wt', encoding='utf8') as ofp_tf_ab,\
        gzip.open(dffile_ab, mode='wt', encoding='utf8') as ofp_df_ab,\
        gzip.open(tffile_anob, mode='wt', encoding='utf8') as ofp_tf_anob,\
        gzip.open(dffile_anob, mode='wt', encoding='utf8') as ofp_df_anob:
        parse_and_write(knp, ifp, ofp_tf_anob, ofp_tf_ab, ofp_df_anob, ofp_df_ab)
Exemple #19
0
    def total_chunk(text):
        """Count bunsetsu chunks over all '。'-separated sentences of *text*.

        Chunks whose joined surface form equals the string "None" are
        skipped; sentences KNP cannot parse are silently ignored
        (deliberate best-effort behavior).
        """
        from pyknp import KNP
        knp = KNP()

        num = 0
        for stc in text.split("。"):
            # skip the empty trailing fragment produced by split
            if stc == "":
                continue
            try:
                result = knp.parse(stc)
                for bnst in result.bnst_list():
                    if "".join(mrph.midasi
                               for mrph in bnst.mrph_list()) != "None":
                        num += 1
            except Exception:
                # best-effort: ignore sentences that fail to parse
                pass

        return num
    def get_context_words(self, sentence_size_limit=100):
        """Collect argument words from up to *sentence_size_limit* sentences.

        Sentences whose KNP output cannot be fetched or parsed are reported
        to stderr and skipped.

        Returns:
            dict: word -> occurrence count across the processed sentences.
        """
        knp = KNP()
        knp_extractor = KNP_extractor(self.config.knp_index_db, self.config.knp_parent_dir, self.config.knp_sub_index_length)
        context_words = Counter()
        for index, sent_tuple in enumerate(self.sents[:sentence_size_limit]):
            sid = sent_tuple.sid.split('%')[0]
            sup_knp = knp_extractor.get_knp(sid)
            if not sup_knp:
                sys.stderr.write("fail to convert knp of %s.\n" % sid)
                continue

            try:
                result = knp.result(sup_knp.decode('utf-8'))
                context_words.update(self._get_sentence_args(result))
            except Exception:
                # was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit — narrowed to Exception
                sys.stderr.write("fail to convert knp of %s.\n" % sid)

        context_words = dict(context_words)
        return context_words
Exemple #21
0
def read_knp_result_file(filename: str) -> List[BList]:
    """Read a KNP result file.

    Args:
        filename: A filename.

    Returns:
        A list of :class:`pyknp.knp.blist.BList` objects, one per sentence
        (sentences are delimited by EOS lines).
    """
    knp = KNP()
    results = []
    with open(filename, "rt", encoding="utf-8", errors="replace") as fin:
        buffer = ""
        for line in fin:
            buffer += line
            if line.strip() != "EOS":
                continue
            results.append(knp.result(buffer))
            buffer = ""
    return results
    def __init__(
            self, word2vec_model: Word2VecModel,
            juman_command: str='jumanpp',
            specific_parts: Optional[List[str]]=None
    ) -> None:
        """Set up the morphological analyzer, KNP parser and word2vec model.

        Args:
            word2vec_model: Word vector model used for lookups.
            juman_command: Either 'juman' or 'jumanpp'.
            specific_parts: POS names to keep; defaults to ['普通名詞'].

        Raises:
            AttributeError: If *juman_command* is neither 'juman' nor 'jumanpp'.
        """
        if specific_parts is None:
            specific_parts = ['普通名詞']

        if juman_command == 'juman':
            self.juman: Union[Juman, Jumanpp] = Juman()
        elif juman_command == 'jumanpp':
            self.juman: Union[Juman, Jumanpp] = Jumanpp()
        else:
            # same exception type as before, but now says what was wrong
            raise AttributeError(f"unknown juman_command: {juman_command!r}")
        self.knp: KNP = KNP(jumancommand=juman_command)

        self.specific_parts: List[str] = specific_parts

        self.word2vec: Word2VecModel = word2vec_model
Exemple #23
0
def main():
    """Extract vocabulary-filtered dependency pairs from a gzipped KNP dump."""
    knp = KNP(jumanpp=True, option='-tab')
    knpfile = sys.argv[1]  # '../dataset/wikipedia.knp.gz'
    vocabfile = sys.argv[2]  # '../dataset/wikipedia.vocab'
    depsfile = sys.argv[3]  # '../dataset/wikipedia.deps'

    # frequency cutoff applied over the whole vocabulary
    vocab_thre = 100
    assert 'gz' in vocabfile
    with gzip.open(vocabfile, mode='rt', encoding='utf8') as ifp:
        vocab = read_vocab(ifp, vocab_thre)

    # extract dependency pairs from a knp parsed file.
    # CoNLL: tokens = [(id,form, head,deprel)]
    assert 'gz' in knpfile
    assert 'gz' in depsfile
    with gzip.open(knpfile, mode='rt', encoding='utf8', errors='ignore') as ifp,\
         gzip.open(depsfile, mode='wt', encoding='utf8') as ofp:
        write_deps(knp, vocab, ifp, ofp)
Exemple #24
0
def generate_utterance(u_sen, all_infos):
    """Generate an answer utterance for the user sentence *u_sen*.

    Determines which question-word slot is being asked, finds the info
    record mentioning any word of the sentence, and answers from that record
    or — when it lacks the slot — from a neighboring record.

    Args:
        u_sen: The user's sentence.
        all_infos: Sequence of info dicts (slot -> value) to search.

    Returns:
        str: The answer text, or "No information" when nothing matches.
    """
    # determine what is being asked
    question = get_u_gimonshi(u_sen)

    # locate the info record containing any word of the sentence
    knp = KNP(option='-tab -anaphora')
    result = knp.parse(u_sen.replace(" ", ""))
    bnst_list = result.bnst_list()
    search_words = []
    for bnst in bnst_list:
        search_words.append(
            create_infos.select_normalization_representative_notation(
                bnst.fstring))

    search_point = -1
    for search_word in search_words:
        for i, info in enumerate(all_infos):
            if search_word in info.values():
                search_point = i

    if search_point == -1:
        return "No information"

    def _answer_at(idx):
        # answer stored at record *idx*, or None if absent / out of range
        # (the original indexed search_point + 1 unconditionally and raised
        # IndexError when the match was the last record)
        if 0 <= idx < len(all_infos) and all_infos[idx].get(question) is not None:
            return all_infos[idx][question] + "です"
        return None

    # try the matched record first, then its neighbors, preserving the
    # original preference order (previous record before next, except at 0)
    answer = _answer_at(search_point)
    if answer is None and search_point != 0:
        answer = _answer_at(search_point - 1)
    if answer is None:
        answer = _answer_at(search_point + 1)
    return answer if answer is not None else "No information"
Exemple #25
0
 def __init__(self, text: str, delimiter: str = '\n'):
     """Split *text* into sentences and precompute text statistics.

     Args:
         text: The document text to analyze.
         delimiter: Sentence delimiter used to split *text*.
     """
     self.text = text
     self.delimiter = delimiter
     self.sentences = util.split_text(self.text, delimiter)
     self.n_sentences = len(self.sentences)
     self.knp = KNP(option=DefaultOptions.KNP, jumanpp=False)
     self.trees = self._trees()
     self.juman = Juman(jumanpp=False)
     self.rs_pos = self.calc_rs_pos()
     self.n_mrphs = self.calc_n_mrphs()
     self.n_chunks = self.calc_n_chunks()
     self.n_types = self.calc_n_types()
     # per-sentence averages are undefined for empty input, hence the None guards
     self.mean_n_mrphs = None \
         if self.n_sentences == 0 \
         else self.n_mrphs / self.n_sentences
     self.rs_modality = self.calc_rs_modality()
     self.r_conditional = None \
         if self.n_sentences == 0 \
         else self.calc_n_conditionals() / self.n_sentences
     self.mean_tree_depths = self.calc_mean_tree_depths()
Exemple #26
0
def evg():
    """Read KNP results from stdin, build an EventGraph, and save or print it."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", "-o", default="", help="path to output")
    args = parser.parse_args()

    basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    knp = KNP()
    results = []
    chunk = ""
    # accumulate lines until an EOS marker, then parse the chunk as one result;
    # getattr falls back to sys.stdin itself when it has no .buffer
    for line in codecs.getreader("utf-8")(getattr(sys.stdin, "buffer",
                                                  sys.stdin)):
        chunk += line
        if line.strip() == "EOS":
            results.append(knp.result(chunk))
            chunk = ""
    evg_ = EventGraph.build(results)
    if args.output:
        evg_.save(args.output)
    else:
        print(json.dumps(evg_.to_dict(), indent=4, ensure_ascii=False))
Exemple #27
0
def test():
    """Smoke test: read KNP '-tab -dpnd' output from stdin, print head dependencies.

    ex.) echo "私は自然言語処理の研究をする" | juman | knp -tab -dpnd | python DependencyParser.py
    """
    # NOTE: the original used Python 2 print statements and wrapped
    # sys.stdin/out/err in codecs readers/writers; on Python 3 the streams
    # are already UTF-8 text and the print statements were syntax errors.
    knp = KNP()
    data = ""

    for line in iter(sys.stdin.readline, ""):
        data += line
        if line.strip() == "EOS":
            result = knp.result(data)
            DB = parseDependency(result.bnst_list(), head=False)
            DBhead = parseDependency(result.bnst_list(), head=True)
            print("parent-child")
            for bnstrep in DBhead:
                print(bnstrep)
            data = ""
Exemple #28
0
from pyknp import KNP

# Demo: find predicate bunsetsu that depend on a noun-phrase head with a
# nonzero gap between them, and print the pair plus the gap size.
knp = KNP(jumanpp=True)
result = knp.parse("望遠鏡で泳いでいる女の子を見た。")

for bnst in result.bnst_list():
    parent = bnst.parent
    if parent is not None:
        # child is a predicate (用言), parent is a noun phrase (体言),
        # and they are not adjacent (gap of at least one bunsetsu)
        if "<用言:" in bnst.fstring and "<体言>" in parent.fstring and parent.bnst_id - bnst.bnst_id - 1 != 0:
            child_rep = " ".join(mrph.repname for mrph in bnst.mrph_list())
            parent_rep = " ".join(mrph.repname for mrph in parent.mrph_list())
            print(child_rep, "->", parent_rep,
                  parent.bnst_id - bnst.bnst_id - 1)
Exemple #29
0
import os
import ptvsd
import json
import regex
import pprint

from pyknp import KNP

ptvsd.enable_attach(address=("0.0.0.0", 3000))
ptvsd.wait_for_attach()
ptvsd.break_into_debugger()

knp = KNP()     # Default is JUMAN++. If you use JUMAN, use KNP(jumanpp=False)

file_name = "feedbacksheet.json"

if os.path.exists(file_name):
    os.remove(file_name)

dependency_dic = {}
for index in range(2, 22):
    index_json = {}
    feature_json = {}
    with open("data/{}.json".format(index), "r") as f:
        json_data = json.load(f)

    history = json_data.pop("history").strip("[]")
    history_list = regex.findall("{(?>[^{}]+|(?R))*}", history)

    comment_2d_list = []
    for history_str in history_list:
def init_knp():
    """Return a KNP parser configured for CRF-based NE tagging with table output."""
    parser = KNP(option='-ne-crf -tab')
    return parser