Code example #1
    def __init__(self):
        """Constructs a Tokenizer for Juman++.
    """
        from pyknp import Juman

        self.do_lower_case = False
        self._jumanpp = Juman()
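This wrapper follows the basic pyknp pattern that most of the examples below build on. A minimal self-contained sketch of that pattern (assuming the jumanpp binary is installed and on PATH; the sample sentence is arbitrary):

from pyknp import Juman

jumanpp = Juman()
# analysis() runs Juman++ on the text and returns an MList of morphemes
result = jumanpp.analysis("これはペンです。")
# midasi is the surface form of each morpheme
print(" ".join(mrph.midasi for mrph in result.mrph_list()))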
Code example #2
 def run(self):
     data = self.load()
     jumanpp = Juman()
     output = []
     for _, row in data.iterrows():
         zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
         splited = [
             mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()
         ]
         if self.task_name == 'QA_B':
             qa_zenkaku = jaconv.h2z(
                 f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                 ascii=True,
                 digit=True,
             )
         else:
             qa_zenkaku = " "
         qa_splited = [
             mrph.midasi
             for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()
         ]
         output.append({
             "context": " ".join(splited),
             "qa": " ".join(qa_splited),
             "label": 1
         })
     self.dump(pd.DataFrame(output))
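The jaconv.h2z call above converts half-width ASCII and digits to full-width before analysis, a common preprocessing step since JUMAN's dictionary entries are full-width. A minimal sketch of just that step (assuming jaconv and Juman++ are installed; the sample string is arbitrary):

import jaconv
from pyknp import Juman

jumanpp = Juman()
# convert half-width ASCII letters and digits to full-width (zenkaku)
zenkaku = jaconv.h2z("BERTで2値分類", ascii=True, digit=True)
print(" ".join(mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()))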
Code example #3
File: knp.py Project: Kazuuuuuki/utech_crawler
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='juman',
                 jumanrcfile='',
                 jumanpp=False):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = (jumancommand == "jumanpp") or jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
            quit(1)

        if self.jumanpp:
            self.juman = Jumanpp()
        else:
            self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)
Code example #4
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    texts = []
    list_text = []
    jumanpp = Juman()
    query = request.args.get('q')
    result = jumanpp.analysis(query)
    for mrph in result.mrph_list():
        texts.append(mrph.midasi)
    list_text.append(" ".join(texts))
    query_vector = bc.encode(list_text, is_tokenized=False)[0]
    script_query = {
        "script_score": {
            "query": {
                "match": {
                    "source": "tb"
                }
            },
            "script": {
                "source":
                "cosineSimilarity(params.query_vector, doc['question_vector']) + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }

    response = client.search(index=INDEX_NAME,
                             body={
                                 "size": SEARCH_SIZE,
                                 "query": script_query
                             })
    return jsonify(response)
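For the script_score query above to work, question_vector must be indexed as a dense_vector field. A hedged sketch of a matching index mapping (field names follow the query above; 768 dims is an assumption that fits BERT-base):

mapping = {
    "mappings": {
        "properties": {
            "source": {"type": "keyword"},
            "question": {"type": "text"},
            "question_vector": {"type": "dense_vector", "dims": 768},
        }
    }
}
client.indices.create(index=INDEX_NAME, body=mapping)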
Code example #5
File: knp.py Project: kzinmr/pyknp-extend
    def __init__(self, command='knp', option='-tab', rcfile='',
                 server=None, port=31000, timeout=30,
                 pattern=r'(?:^|\n)EOS($|\n)',
                 jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
                 juman_command='juman', jumanpp=False):

        self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
        assert 'EOS' in pattern
        self.pattern = pattern
        self.EOS = 'EOS'
        # only the -tab output format is parsed
        assert '-tab' in option

        if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
            sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
            quit(1)

        # Setup Juman(++)
        assert port != juman_port
        juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                      'server': server, 'port': juman_port}
        if self.use_jumanpp:
            self.juman = Jumanpp(**juman_args)
        else:
            self.juman = Juman(**juman_args)
        # Setup KNP
        if server is not None:
            self.socket = Socket(server, port, option=option, timeout=timeout)
            self.query = partial(self.socket.query, pattern=pattern)
        else:
            if rcfile:
                option += " -r {}".format(rcfile)
            self.subprocess = Subprocess(command, option=option)
            self.query = partial(self.subprocess.query, pattern=pattern)
Code example #6
File: knp.py Project: matsurih/pyknp
    def __init__(self,
                 command='knp',
                 server=None,
                 port=31000,
                 timeout=60,
                 option='-tab',
                 rcfile='',
                 pattern=r'EOS',
                 jumancommand='jumanpp',
                 jumanrcfile='',
                 jumanoption='',
                 jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand,
                           rcfile=jumanrcfile,
                           option=jumanoption,
                           jumanpp=self.jumanpp)
Code example #7
 def _apply_jumanpp(self, inp: str) -> Tuple[str, str]:
     jumanpp = Juman(command=self.juman, option=self.juman_option)
     jumanpp_result = jumanpp.analysis(inp)
     jumanpp_out = jumanpp_result.spec() + 'EOS\n'
     jumanpp_conll_out = self._jumanpp2conll_one_sentence(
         jumanpp_out) + 'EOS\n'
     return jumanpp_out, jumanpp_conll_out
Code example #8
def main(DATA_ROOT):
    text_files = Path(DATA_ROOT).glob('**/*.txt')
    for text_file in text_files:
        with open(text_file) as f:
            content = f.read()

        content = re.sub(r"=+(.*?)=+", "\g<1>", content)
        content = re.sub(r"^\n", "", content, flags=re.MULTILINE)
        content = content.replace('<block>', '')
        content = content.replace('<math-element>', '')
        # In this case, 。 can be removed safely
        sentences = re.split(r"[。\n]", content)
        sentences = [line for line in sentences if len(line) != 0]
        sentences = [''.join(line.split()) for line in sentences]

        # Remove sentences that are not properly parsed
        val_sentences = []
        offsets = []

        juman = Juman()

        for sentence in tqdm(sentences):
            # Try to parse; skip sentences Juman++ cannot handle
            try:
                result = juman.analysis(sentence)
            except ValueError:
                print(sentence)
                continue

            current = 0
            offset = [0 for _ in range(len(sentence))]

            for mrph in result.mrph_list():
                current = current + len(mrph.midasi)
                try:
                    offset[current - 1] = 1

                except IndexError as e:
                    print(sentence)
                    print(current)
                    for _mrph in result.mrph_list():
                        print(_mrph.midasi)
                    raise e


            val_sentences.append(sentence)
            offsets.append(offset)

        results = (val_sentences, offsets)

        file_name = text_file.name[:-4] + '.pickle'
        dic = text_file.parent

        with open(Path(dic, file_name), 'wb') as f:
            pickle.dump(results, f)
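The offset array built above marks, at each character index, whether a morpheme ends there. A self-contained illustration of that encoding using a hypothetical segmentation (not actual Juman++ output):

# hypothetical token lengths 2, 1, 4 over a 7-character sentence
tokens = ["これ", "は", "ペンです"]
offset = [0] * sum(len(t) for t in tokens)
current = 0
for t in tokens:
    current += len(t)
    offset[current - 1] = 1  # a morpheme boundary falls after this index
print(offset)  # [0, 1, 1, 0, 0, 0, 1]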
Code example #9
def main(bert_vocab_filepath, example_filepath, context_filepath,
         cache_save_dir):
    #Juman++
    juman = Juman(jumanpp=True)

    logger.info("Cache files will be saved in {}.".format(cache_save_dir))

    #Tokenizer
    logger.info("Create a tokenizer from {}.".format(bert_vocab_filepath))
    tokenizer = BertTokenizer.from_pretrained(bert_vocab_filepath,
                                              do_lower_case=False)

    logger.info("Start loading examples from {}.".format(example_filepath))
    examples = load_examples(example_filepath)
    logger.info("Finished loading examples.")
    logger.info("Number of examples: {}".format(len(examples)))

    logger.info("Start loading contexts from {}.".format(context_filepath))
    contexts = load_contexts(context_filepath)
    logger.info("Finished loading contexts.")

    logger.info("Start encoding examples.")
    encoding = encode_examples(juman, tokenizer, examples, contexts, 512)
    logger.info("Finished encoding examples.")

    os.makedirs(cache_save_dir, exist_ok=True)
    torch.save(encoding["input_ids"],
               os.path.join(cache_save_dir, "input_ids.pt"))
    torch.save(encoding["attention_mask"],
               os.path.join(cache_save_dir, "attention_mask.pt"))
    torch.save(encoding["token_type_ids"],
               os.path.join(cache_save_dir, "token_type_ids.pt"))
    torch.save(encoding["labels"], os.path.join(cache_save_dir, "labels.pt"))
    logger.info("Saved cache files in {}.".format(cache_save_dir))
Code example #10
    def string_word_point(self, df):
        jumanpp = Juman(jumanpp=False)
        tmp_word = []
        df_time_word = pd.DataFrame(index=[], columns=['time', 'word'])  # words and their timestamps
        df_word_point = pd.DataFrame(index=[], columns=['word', 'point'])  # words and their occurrence counts
        df_time_point = pd.DataFrame(index=[], columns=['time', 'point'])  # comment counts per time
        df_time_www_point = pd.DataFrame(index=[], columns=['time', 'point'])  # "www" (laughter) counts per time
        df_time_hakusyu_point = pd.DataFrame(index=[], columns=['time', 'point'])  # applause counts per time
        df_URL_point = pd.DataFrame(index=[], columns=['URL', 'point'])  # URL summary

        for i in range(len(df)):
            # record the comment if it is a URL
            url = URL_hanbetu(df['comment'][i])
            if url != False:
                tmp = self.my_index(df_URL_point['URL'], url)
                df_URL_point = self.make_df_append(df_URL_point, tmp, url)

            # strip symbols from the comment
            df['comment'][i] = self.my_delete(df['comment'][i])
            # convert h:m:s into an integer hms value
            tmp_time = self.strtime_to_inttime(df['time'][i])

            # count comments per time
            tmp = self.my_index(df_time_point['time'], tmp_time)
            df_time_point = self.make_df_append(df_time_point, tmp, tmp_time)
            # add 1 if the comment contains "www" (laughter), otherwise 0
            if self.www_hanbetu(df['comment'][i]) != False and url == False:
                df_time_www_point = self.make_df_append(df_time_www_point, tmp, tmp_time)
            elif tmp == False:
                df_time_www_point = df_time_www_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
            # add 1 if the comment contains applause, otherwise 0
            if self.hakusyu_hanbetu(df['comment'][i]) != False:
                df_time_hakusyu_point = self.make_df_append(df_time_hakusyu_point, tmp, tmp_time)
            elif tmp == False:
                df_time_hakusyu_point = df_time_hakusyu_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)

            # morphological analysis of the comment
            result = jumanpp.analysis(df['comment'][i])
            # build the word DataFrames from the analysis result
            for token in result.mrph_list():
                tmp_word = token.midasi
                # count noun occurrences
                if self.word_Classification(token.hinsi) == '名詞':
                    tmp = self.my_index(df_word_point['word'], tmp_word)
                    df_word_point = self.make_df_append(df_word_point, tmp, tmp_word)
                    # record the noun together with its timestamp
                    df_time_word = df_time_word.append({'time': tmp_time, 'word': tmp_word}, ignore_index=True)

        return df_time_word, df_word_point, df_time_point, df_time_www_point, df_time_hakusyu_point, df_URL_point
Code example #11
 def morphological_analysis(self, text):
     jumanpp = Juman()
     ret = []
     text = self.remove_special_character(text)
     result = jumanpp.analysis(text)  # this splits the text into space-separable words
     for mrph in result.mrph_list():
         ret += self.modification(mrph.midasi)
     return ret
Code example #12
    def __init__(self, bert_model: str, fine_tuned_model: str, jumanpp_command: str):
        self.jumanpp = Juman(command=jumanpp_command)

        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
        self.model = BertPosNegClassifier(bert_model)
        state_dict = torch.load(fine_tuned_model, map_location=torch.device('cpu'))
        self.model.load_state_dict({k.replace('module.', ''): v for k, v in state_dict.items()})
        self.model.eval()
Code example #13
File: frequency.py Project: ssabcire/NLP
def counter(text, d):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    for mrph in result.mrph_list():
        # count occurrences by lemma (genkei)
        d[mrph.genkei] = d.get(mrph.genkei, 0) + 1
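A usage sketch for counter (assuming Juman++ is installed; the input text is arbitrary):

freq = {}
counter("これはペンです。これもペンです。", freq)
for genkei, count in sorted(freq.items(), key=lambda kv: -kv[1]):
    print(genkei, count)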
Code example #14
 def __init__(
     self,
     preprocessor=None,
     stopwords=None,  # None instead of a mutable default list
 ):
     self.jumanpp = Juman()
     self.preprocessor = preprocessor
     self.stopwords = stopwords if stopwords is not None else []
Code example #15
    def test_juman_wrapper(self):
        juman = Juman()
        result = juman.analysis(u"これはペンです。")
        print(','.join(mrph.midasi for mrph in result))

        for mrph in result.mrph_list():
            assert isinstance(mrph, pyknp.Morpheme)
            print(u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                  % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
Code example #16
    def __init__(self):
        """
        日本語専用トークナイザの構築。
        JUMAN++ を使用する。
   """
        from pyknp import Juman

        self.do_lower_case = False
        self._jumanpp = Juman()
Code example #17
def juman_list(text):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    # replace every alphabetic token with the string "En"
    wakati = [
        mrph.genkei if mrph.bunrui != "アルファベット" else "En"
        for mrph in result.mrph_list()
    ]
    return ",".join(wakati)
Code example #18
def _morphological_analysis(tweet: str) -> List[str]:
    '''
    Morphologically analyze a tweet and return a list of lemmas.
    '''
    text = _remove_unnecessary(tweet)
    if not text:
        return []
    return [mrph.genkei for mrph in Juman().analysis(text).mrph_list()
            if mrph.hinsi in ['名詞', '動詞', '形容詞', '接尾辞']]
Code example #19
File: test.py Project: tennmoku71/kotodama
    def test_unknown_word(self):

        # before disableError is called, an unknown word raises a KeyError
        with self.assertRaises(KeyError):
            message = kotodama.transformVerb("嫌いだ", {"過去"})

        kotodama.disableError(Juman())

        # print(kotodama.transformVerb("嫌いだ", set()))
        self.assertEqual(kotodama.transformVerb("嫌いだ", {"過去"}), '嫌いだ')
Code example #20
File: add_sems.py Project: shirayu/kyoto-reader
def get_repname_using_jumanpp(genkei: str, pos: str) -> str:
    if pos == '助詞':
        return f'{genkei}/{genkei}'

    juman = Juman(option='-s 1')
    mrphs = juman.analysis(genkei, juman_format=JUMAN_FORMAT.LATTICE_TOP_ONE)
    # check that the analysis is not wrong (i.e. the input came back as a single morpheme)
    if len(mrphs) == 1:
        return mrphs[0].repname

    return f'{genkei}/{genkei}'
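A usage sketch (assuming Juman++ is installed, and assuming JUMAN_FORMAT is importable from pyknp as the function requires):

from pyknp import Juman, JUMAN_FORMAT

print(get_repname_using_jumanpp('走る', '動詞'))  # a repname such as 走る/はしる if it stays one morpheme
print(get_repname_using_jumanpp('は', '助詞'))    # particles short-circuit to は/は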
Code example #21
    def __init__(self):
        '''initialize

        Examples
        --------
        >>> nlp = JNLP()

        '''

        self.juman = Juman()
        self.KNP = KNP(option='-tab -anaphora')
Code example #22
    def test_juman_wrapper(self):
        try:
            juman = Juman(command=self.path_to_juman_command)
            result = juman.analysis("これはペンです。")
            logger.debug(','.join(mrph.midasi for mrph in result))

            for mrph in result.mrph_list():
                assert isinstance(mrph, pyknp.Morpheme)
                logger.debug("見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                      % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
        except ImportError:
            print('skip test_juman_wrapper')
Code example #23
def seg2word(seg):
    len_split = 1000
    # seg = seg_in.replace(' ', '\u3000')
    # seg = seg_in.replace(' ', ' ')
    len_seg = len(seg)
    seg_splits = [seg[i:i + len_split] for i in range(0, len_seg, len_split)]

    juman_def = Juman(command="/mnt/gold/users/s18153/bin/jumanpp")
    return ' '.join([
        " ".join(
            [mrph.midasi for mrph in juman_def.analysis(seg_part).mrph_list()])
        for seg_part in seg_splits
    ])
Code example #24
def bulk_predict(docs, batch_size=256):
    """Predict bert embeddings."""
    jumanpp = Juman(jumanpp=False)
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i+batch_size]
        pre_embedding_docs = []
        for doc in batch_docs:
            # split long questions into MAX_TXT_LENGTH-character chunks before analysis
            for k in range(0, len(doc['question']), MAX_TXT_LENGTH):
                result = jumanpp.analysis(doc['question'][k:k+MAX_TXT_LENGTH])
                texts = [mrph.midasi for mrph in result.mrph_list()]
                pre_embedding_docs.append(" ".join(texts))
        embeddings = bc.encode(pre_embedding_docs, is_tokenized=True)
        for emb in embeddings:
            yield emb
Code example #25
File: bow_juman.py Project: tuxedocat/ntcir13-medweb
def parser_func_juman(lemmatize: bool = True) -> Callable[[str], List[str]]:
    juman = Juman()
    if lemmatize:

        def f(s: str) -> List[str]:
            return [m.genkei for m in juman.analysis(s)]

        return f
    else:

        def g(s: str) -> List[str]:
            return [m.midasi for m in juman.analysis(s)]

        return g
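A usage sketch for the factory above (assuming Juman++ is installed):

tokenize = parser_func_juman(lemmatize=True)
print(tokenize("これはペンです。"))  # list of lemmas (genkei), one per morpheme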
Code example #26
def juman_wakati(text, hinshi=(), DEBUG=False, STEM_FLAG=False):
    juman = Juman()
    output = ""
    # wakati
    result = juman.analysis(text)
    for mrph in result.mrph_list():
        if STEM_FLAG and mrph.hinsi in hinshi:
            output += mrph.repname.split("/")[0] + " "
        if DEBUG:
            print("stem:", mrph.repname)
            print("midashi:", mrph.repname)
            print("hinsi:", mrph.hinsi)
            print("yomi:", mrph.yomi)
    return output.strip()
Code example #27
    def __init__(
        self,
        cls: Type["Defaults"],
        nlp: Optional[Language] = None,
        juman_kwargs: Optional[Dict[str, str]] = None,
        preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
    ):
        """

        Args:
            juman_kwargs: passed to `pyknp.Juman.__init__`
            preprocessor: applied to text before tokenizing. `mojimoji.han_to_zen` is often used.
        """
        from pyknp import Juman

        juman_kwargs = juman_kwargs or {}
        default_command = get_juman_command()
        assert default_command
        juman_kwargs.setdefault("command", default_command)

        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.tokenizer = Juman(**juman_kwargs) if juman_kwargs else Juman()
        self.juman_kwargs = juman_kwargs
        self.preprocessor = preprocessor
Code example #28
File: knp.py Project: ku-nlp/pyknp
    def __init__(
        self,
        command='knp',
        server=None,
        port=31000,
        timeout=60,
        option='-tab',
        rcfile='',
        pattern=r'EOS',
        jumancommand='jumanpp',
        jumanrcfile='',
        jumanoption='',
        jumanpp=True,
        multithreading=False,
    ):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.options = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        if server is not None:
            self.analyzer = Analyzer(backend='socket',
                                     timeout=timeout,
                                     server=server,
                                     port=port,
                                     socket_option='RUN -tab -normal\n')
        else:
            cmds = [self.command] + self.options
            if self.rcfile:
                cmds += ['-r', self.rcfile]
            self.analyzer = Analyzer(backend='subprocess',
                                     multithreading=multithreading,
                                     timeout=timeout,
                                     command=cmds)
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand,
                           rcfile=jumanrcfile,
                           option=jumanoption,
                           jumanpp=self.jumanpp,
                           multithreading=multithreading)
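Since this constructor wires Juman(++) into KNP, a minimal sketch of the standard pyknp parsing loop on top of it (assuming both KNP and Juman++ are installed; the sentence is arbitrary):

from pyknp import KNP

knp = KNP()
result = knp.parse("京都大学に行った。")
for bnst in result.bnst_list():  # iterate over bunsetsu (phrase) units
    print(bnst.bnst_id, "".join(mrph.midasi for mrph in bnst.mrph_list()))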
Code example #29
def word_distance(s1, s2):
    juman = Juman()
    # count ASCII alphanumeric characters; if they dominate, fall back to the English variant
    r = len(s1 + s2) - len(re.sub("[a-zA-Z0-9]", "", s1 + s2))
    if r > len((s1 + s2).replace(" ", "")) // 2:
        return word_distance_en(s1, s2)
    sss = [
        {
            item.midasi
            for item in juman.analysis(ss).mrph_list()
            if item.hinsi in {'名詞', '動詞', '形容詞', '指示詞'} or '内容語' in item.imis
        }
        for ss in [s1, s2]
    ]
    if min(len(sss[0]), len(sss[1])) == 0:
        return 0
    return float(len(sss[0] & sss[1])) / min(len(sss[0]), len(sss[1]))
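The return value is the overlap coefficient of the two content-word sets. A small self-contained illustration with hypothetical sets (not Juman++ output):

a = {"犬", "走る"}
b = {"犬", "猫", "食べる"}
print(len(a & b) / min(len(a), len(b)))  # 1 / 2 = 0.5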
Code example #30
def title_clean(title_ls):
    tmp_ls = copy.deepcopy([title_ls])
    # drop None entries (deleting by index while iterating is unsafe)
    tmp_ls = [t for t in tmp_ls if t is not None]
    for i in range(len(tmp_ls)):        
        tmp_ls[i] = normalize('NFKC', tmp_ls[i])
        tmp_ls[i] = tmp_ls[i].replace(' ', '')
        tmp_ls[i] = re.sub(r'−.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'ーY.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'\|.+?$', '', tmp_ls[i])
    jumanpp = Juman()
    sep_ls = []
    for tmp in tmp_ls: 
        sep_ls.append(' '.join([mrph.midasi for mrph in jumanpp.analysis(tmp)]))
    return sep_ls[0]