def get_keitaiso_list_from_juman(text):
    """Morphologically analyze text and return its morphemes.
    Juman resolves orthographic-variation issues that MeCab cannot handle.
    """
    jumanpp = Jumanpp()
    keitaiso_list = []
    hinshi_list = []
    exclusive_word_list = get_exclusive_word_list()
    # Spaces cause an error, and a leading '#' makes the process hang (reason unknown).
    text = text.replace(" ", "").replace("　", "").replace("#", "/")
    result = jumanpp.analysis(text)  # Python 3 strings are already Unicode
    try:
        for mrph in result.mrph_list():
            keitaiso = mrph.genkei
            hinshi = mrph.hinsi
            # Skip morphemes with a disallowed POS, in the exclusion list, or purely numeric.
            if not is_valid_word_class(hinshi) or keitaiso in exclusive_word_list or keitaiso.isdigit():
                continue
            keitaiso_list.append(keitaiso)
            hinshi_list.append(hinshi)
    except Exception:
        traceback.print_exc()
    return [keitaiso_list, hinshi_list]
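# Minimal usage sketch for get_keitaiso_list_from_juman. Assumes Juman++ is
# installed and that get_exclusive_word_list() and is_valid_word_class() are
# defined in this module; the sample sentence and output are illustrative.
if __name__ == '__main__':
    keitaiso_list, hinshi_list = get_keitaiso_list_from_juman("ケーキを食べた")
    for word, pos in zip(keitaiso_list, hinshi_list):
        print(word, pos)  # base form and its POS, e.g. ケーキ 名詞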
def create_gensim_dictionary(data_path, no_below=2, no_above=0.1):
    print("# morphological analysis")
    jumanpp = Jumanpp()  # create the analyzer once instead of once per line
    docs = {}
    docs_title = {}
    for root, dirs, files in os.walk(data_path):
        for docname in files:
            docs[docname] = []
            with open(os.path.join(data_path, docname), "r") as f:
                lines = f.readlines()
                docs_title[docname] = lines[0]
                for text in lines:
                    text_replace = text.replace(" ", "").replace("\n", "").replace(
                        "#", "").replace("@", "")
                    if text_replace != "":
                        result = jumanpp.analysis(text_replace)
                        for mrph in result.mrph_list():
                            if len(mrph.midasi) > 1:
                                docs[docname].append(mrph.midasi)
        break  # files are opened relative to data_path, so only the top level is read
    dictionary = gensim.corpora.Dictionary(docs.values())
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    return docs, docs_title, dictionary
def main():
    if len(sys.argv) != 2:
        print('need one argument for a file.')
        return
    file_name = sys.argv[1]
    vocab_dict = defaultdict(int)
    juman = Jumanpp()
    with open(file_name, 'r', encoding='utf-8', newline='') as fr:
        text = fr.readlines()
    for line in text:
        # Juman++ does not handle half-width characters, so convert them to full-width.
        line = line.replace(' ', '　')
        # str.translate returns a new string, so reassign it.
        line = line.translate(
            str.maketrans(
                {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
        analysis = juman.analysis(line.replace('\n', ''))
        for m in analysis.mrph_list():
            vocab_dict[str(m.midasi)] += 1
    sorted_dict = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
    print(sorted_dict)
    print(len(sorted_dict))
def append_repname(words):
    """
    :param words: a list of Word instances
    :return: a list of Word instances with preprocessed words with the representative expressions
    """
    n_word = len(words)
    juman = Jumanpp()
    bar = progressbar.ProgressBar()
    for i in bar(range(n_word), max_value=n_word):
        word = words[i]
        if word.uid != i:
            continue  # already merged
        repname_set = []
        r = juman.analysis(word.p_surface)
        for mrph in r.mrph_list():
            if mrph.bunrui == '数詞':
                repname_set.append([kansuji2arabic(mrph.midasi)])
            elif mrph.repnames() != '':
                repname_set.append(mrph.repnames().split('?'))
            else:
                repname_set.append([mrph.midasi])
        words[i].alias.extend(expand_ambiguity(repname_set))
    return words
def read_and_analyze_text():
    jumanpp = Jumanpp()
    midasis = []
    repnames = []
    repname_counts = {}
    wikipedia_redirections = []
    w_rs = []
    w_r_counts = {}
    row_result = []
    while True:
        input_ = sys.stdin.readline()
        if input_ == '':
            break
        input_ = input_.strip()
        if input_ == '':
            continue
        result = jumanpp.analysis(input_)
        for mrph in result.mrph_list():
            if mrph.repname not in repname_counts:
                repname_counts[mrph.repname] = 0
            if mrph.midasi not in midasis and mrph.repname != "":
                repname_counts[mrph.repname] += 1
            w_r = get_wikipedia_redirection(mrph.imis)
            if not w_r:
                w_r = mrph.midasi
            if w_r not in w_r_counts:
                w_r_counts[w_r] = 0
            if mrph.midasi not in midasis:
                w_r_counts[w_r] += 1
            midasis.append(mrph.midasi)
            repnames.append(mrph.repname)
            wikipedia_redirections.append(w_r)
            w_rs.append(w_r)
        midasis.append("\n")
        repnames.append("\n")
        wikipedia_redirections.append(None)
        w_rs.append("\n")
        repname_counts["\n"] = 0
        w_r_counts["\n"] = 0
        row_result.append(result.spec())
    yure_result = []
    for i, midasi in enumerate(midasis):
        # A word is flagged as "yure" (orthographic variation) when its
        # representative form or Wikipedia redirection appears under more
        # than one surface form.
        yure = repname_counts[repnames[i]] > 1 or w_r_counts[w_rs[i]] > 1
        yure_result.append({
            "midasi": midasi,
            "repname": repnames[i],
            "wikipedia_redirection": wikipedia_redirections[i],
            "repname_count": repname_counts[repnames[i]],
            "w_r_count": w_r_counts[w_rs[i]],
            "yure": yure
        })
    return row_result, yure_result
def analysis_text(self, text, debug=None):
    jumanpp = Jumanpp()
    # Juman++ occasionally fails with unexpected errors; return None in that case.
    try:
        result = jumanpp.analysis(text)
    except Exception:
        return None
    if debug:
        self.__print_analyzed(result)
    return result
def segment(texts):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(neologdn.normalize(text)))
            segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
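# Usage sketch for segment(), assuming neologdn, a han_to_zen helper, and a
# configured logger are available; the output shown is illustrative.
if __name__ == '__main__':
    results = segment(['今日はいい天気です'])
    print(results)  # roughly {'今日はいい天気です': '今日 は いい 天気 です'}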
def parser_juman(text):
    from pyknp import Jumanpp
    jumanpp = Jumanpp()
    result = jumanpp.analysis(text)
    words = []
    for n in result.mrph_list():
        # Skip particles, auxiliary verbs, symbols, and whitespace.
        if n.hinsi not in ('助詞', '助動詞', '特殊') and n.bunrui != '空白':
            if n.hinsi == '動詞':
                words.append(n.genkei)  # use the base form for verbs
            else:
                words.append(n.midasi)
    return words
class MorphAnalysis:
    def __init__(self):
        self.stop_path = str(pathlib.Path(
            __file__).resolve().parent) + '/data/stopwords_slothlib.txt'
        self.stopwords = []
        with open(self.stop_path, 'r') as f:
            self.stopwords = f.read().split()
        # morphological analyzer
        self.jumanpp = Jumanpp()

    def to_wakati(self, text, allow_word_class=None,
                  remove_stopwords=False, genkei=False):
        # Use None as the default to avoid a shared mutable default list.
        if allow_word_class is None:
            allow_word_class = [
                '名詞', '指示詞', '動詞', '形容詞', '判定詞', '助動詞', '副詞', '助詞',
                '接続詞', '連体詞', '感動詞', '接頭辞', '特殊', '未定義語'
            ]
        wkt = ""
        text = mojimoji.han_to_zen(text)
        rst = self.jumanpp.analysis(text)
        for mrph in rst.mrph_list():
            # Morpheme attributes: midasi, yomi, genkei, hinsi, bunrui,
            # katuyou1, katuyou2, imis, repname
            if remove_stopwords and (mrph.genkei in self.stopwords):
                continue
            if mrph.hinsi in allow_word_class:
                if genkei:
                    wkt += mrph.genkei + ' '
                else:
                    wkt += mrph.midasi + ' '
        return wkt
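# Usage sketch for MorphAnalysis, assuming data/stopwords_slothlib.txt exists
# next to this module and Juman++ is installed; outputs are illustrative.
if __name__ == '__main__':
    analyzer = MorphAnalysis()
    print(analyzer.to_wakati('ケーキを食べた'))               # surface forms: 'ケーキ を 食べた '
    print(analyzer.to_wakati('ケーキを食べた', genkei=True))  # base forms: 'ケーキ を 食べる '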
def __init__(self, command='knp', option='-tab', rcfile='', server=None,
             port=31000, timeout=30, pattern=r'(?:^|\n)EOS($|\n)',
             jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
             juman_command='juman', jumanpp=False):
    self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
    assert 'EOS' in pattern
    self.pattern = pattern
    self.EOS = 'EOS'
    # Only the -tab output format is parsed.
    assert '-tab' in option
    if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
        quit(1)
    # Set up Juman(++)
    assert port != juman_port
    juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                  'server': server, 'port': juman_port}
    if self.use_jumanpp:
        self.juman = Jumanpp(**juman_args)
    else:
        self.juman = Juman(**juman_args)
    # Set up KNP
    if server is not None:
        self.socket = Socket(server, port, option=option, timeout=timeout)
        self.query = partial(self.socket.query, pattern=pattern)
    else:
        if rcfile:
            option += " -r {}".format(rcfile)
        self.subprocess = Subprocess(command, option=option)
        self.query = partial(self.subprocess.query, pattern=pattern)
def __init__(self, command='knp', server=None, port=31000, timeout=60,
             option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='juman', jumanrcfile='', jumanpp=False):
    self.command = command
    self.server = server
    self.port = port
    self.timeout = timeout
    self.option = option
    self.rcfile = rcfile
    self.pattern = pattern
    self.socket = None
    self.subprocess = None
    self.jumanpp = (jumancommand == "jumanpp") or jumanpp
    if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
        quit(1)
    if self.jumanpp:
        self.juman = Jumanpp()
    else:
        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)
def segment_ja(texts, flag_keep_number=False):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(text))
            if flag_keep_number:
                segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            else:
                # Replace numerals with a <数詞> placeholder token.
                segmented = ' '.join('<数詞>' if m.bunrui == '数詞' else m.midasi
                                     for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
def __init__(self, specific_parts: Optional[List[str]] = None,
             specific_domains: Optional[List[str]] = None):
    self.juman: Jumanpp = Jumanpp()
    if specific_parts is None:
        specific_parts = ['普通名詞']
    if specific_domains is None:
        specific_domains = ['料理・食事']
    self.specific_parts: List[str] = specific_parts
    self.specific_domains: List[str] = specific_domains
    self.words: Optional[List[str]] = None
def main():
    model_w2v = gensim.models.KeyedVectors.load_word2vec_format(
        "/share/data/word2vec/2016.08.02/w2v.midasi.256.100K.bin",
        binary=True, unicode_errors='ignore')
    word2index = {w: i for i, w in enumerate(model_w2v.index2word)}
    model = BiLSTM(embed_mat=model_w2v.vectors, mid_size=128)
    serializers.load_npz("BiLSTM_attention.model", model)
    # Read test sentences from standard input.
    jumanpp = Jumanpp()
    while True:
        input_sentence = sys.stdin.readline()  # str, includes the trailing newline
        if input_sentence == '':  # EOF
            break
        result = jumanpp.analysis(input_sentence)
        doc = [mrph.midasi for mrph in result.mrph_list()]
        x = [doc2list(doc, word2index)]
        # x = list2Var([doc2vec(doc)], np.float32, False)
        with chainer.using_config("train", False):
            y, attn_list = model.predict(x)
        p = np.argmax(y[0].data)
        doc_class = ["新聞記事", " 雑誌 ", " 教科書 ", " ブログ "]
        print("")
        print("*------------------------*")
        print("|                        |")
        print("|        " + doc_class[p] + "        |")
        print("|                        |")
        print("*------------------------*")
        print("")
        prob = F.softmax(y, axis=1)[0].data
        print("新聞記事: {:.6f} 雑誌: {:.6f} 教科書: {:.6f} ブログ: {:.6f}".format(
            prob[0], prob[1], prob[2], prob[3]))
        # Show words sorted by attention weight, highest first.
        for word, attn in sorted(zip(doc, attn_list), key=lambda x: x[1],
                                 reverse=True):
            print(word, end=", ")
        print("\n")
def __init__(self, command='jumanpp', timeout=30, pattern=r'EOS',
             server=None, port=12000, is_use_pyknp=False, **args):
    """* What you can do
    - You can select the backend process of jumanpp.
        - jumanpp-pexpect: calls jumanpp on your local machine and keeps the jumanpp process running.
        - jumanpp-pyknp: calls jumanpp on your local machine, launching a new jumanpp process on every call, so it is slower than jumanpp-pexpect.
        - jumanpp-server: calls jumanpp somewhere else. Keep in mind that you need a jumanpp server process running there.

    * Parameters
    - timeout: time to wait for the jumanpp process.
    - is_use_pyknp: flag deciding whether to use pyknp as the backend process. If True, pyknp is used; if False, pexpect. pexpect is much faster than pyknp, but cannot be used on Windows.
    - server: hostname where jumanpp is running
    - port: port number where jumanpp is running
    """
    # type: (str, int, str, str, int, bool) -> None
    self.eos_pattern = pattern
    self.is_use_pyknp = is_use_pyknp
    if server is not None:
        pattern = pattern.encode('utf-8')
    if os.name == 'nt':
        # pexpect is unavailable on Windows, so force the pyknp backend.
        if not self.is_use_pyknp:
            logger.warning(msg="You're not able to use pexpect on Windows. Forcing is_use_pyknp = True.")
        self.is_use_pyknp = True
    if server is None and self.is_use_pyknp:
        # jumanpp-pyknp #
        self.jumanpp_obj = Jumanpp(command=command,
                                   timeout=timeout,
                                   pattern=pattern,
                                   **args)
    elif server is None:
        # jumanpp-pexpect #
        self.jumanpp_obj = JumanppHnadler(jumanpp_command=command,
                                          timeout_second=timeout,
                                          pattern=pattern)
    else:
        # jumanpp-server #
        self.jumanpp_obj = JumanppClient(hostname=server, port=port,
                                         timeout=timeout)
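# Backend-selection sketch for this wrapper. The class name JumanppWrapper is
# hypothetical (the snippet above only shows __init__); arguments follow the
# docstring above.
#
#   tokenizer = JumanppWrapper()                                # jumanpp-pexpect (default)
#   tokenizer = JumanppWrapper(is_use_pyknp=True)               # jumanpp-pyknp
#   tokenizer = JumanppWrapper(server='localhost', port=12000)  # jumanpp-server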
def parse(line):
    if line is None or line == "\n":
        return
    jumanpp = Jumanpp()
    # Strip newlines and both full- and half-width spaces.
    replaced = re.sub('\n|\u3000| ', '', line)
    result = jumanpp.analysis(replaced)
    words = []
    for mrph in result.mrph_list():
        print('{0} reading: {1} POS: {2} conjugation 1: {3} conjugation 2: {4}'.format(
            mrph.midasi, mrph.yomi, mrph.hinsi, mrph.katuyou1, mrph.katuyou2))
        words.append(mrph.midasi)
    return words
def parser_func_jumanpp(lemmatize: bool = True) -> Callable[[str], List[str]]:
    jumanpp = Jumanpp()
    if lemmatize:
        def f(s: str) -> List[str]:
            return [m.genkei for m in jumanpp.analysis(s).mrph_list()]
        return f
    else:
        def g(s: str) -> List[str]:
            return [m.midasi for m in jumanpp.analysis(s).mrph_list()]
        return g
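# Usage sketch, assuming Juman++ is installed locally; output is illustrative.
if __name__ == '__main__':
    tokenize = parser_func_jumanpp(lemmatize=True)
    print(tokenize('ケーキを食べた'))  # base forms, e.g. ['ケーキ', 'を', '食べる']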
def __init__(self, command='jumanpp', timeout=30, pattern=r'EOS',
             server: str = None, port: int = 12000, **args):
    # type: (str, int, str, str, int) -> None
    if server is not None:
        pattern = pattern.encode('utf-8')
    self.eos_pattern = pattern
    if server is None:
        self.jumanpp_obj = Jumanpp(command=command, timeout=timeout,
                                   pattern=pattern, **args)
    else:
        self.jumanpp_obj = JumanppClient(hostname=server, port=port,
                                         timeout=timeout)
def __init__(self, command='jumanpp', timeout=30, pattern='EOS',
             server=None, port=12000, **args):
    # type: (str, int, str, str, int, Dict[str, Any]) -> None
    self.eos_pattern = pattern
    if server is None:
        self.jumanpp_obj = Jumanpp(command=command, timeout=timeout,
                                   pattern=pattern, **args)
    else:
        self.jumanpp_obj = JumanppClient(hostname=server, port=port,
                                         timeout=timeout)
def __init__(self, word2vec_model: Word2VecModel,
             juman_command: str = 'jumanpp',
             specific_parts: Optional[List[str]] = None) -> None:
    if specific_parts is None:
        specific_parts = ['普通名詞']
    if juman_command == 'juman':
        self.juman: Union[Juman, Jumanpp] = Juman()
    elif juman_command == 'jumanpp':
        self.juman = Jumanpp()
    else:
        raise AttributeError('juman_command must be "juman" or "jumanpp"')
    self.knp: KNP = KNP(jumancommand=juman_command)
    self.specific_parts: List[str] = specific_parts
    self.word2vec: Word2VecModel = word2vec_model
class JumanParser(Parser):
    def __init__(self):
        super().__init__()
        # Strip punctuation and both half- and full-width spaces before parsing.
        remove_pattern = r'・|、|\,|\.| |　'
        self.remove_compiled = re.compile(remove_pattern)
        self.analyzer = Jumanpp()

    def parse(self, message):
        for sent in message.sentences:
            sent.text = self.remove_compiled.sub('', sent.text)
            parsed = self.analyzer.analysis(sent.text)
            mrph_list = parsed.mrph_list()
            sent.bag = self.create_bags(mrph_list)
            message.bags += sent.bag
        return message

    @staticmethod
    def create_bags(mrph_list):
        bag = []
        for mrph in mrph_list:
            if mrph.hinsi == '名詞' or mrph.hinsi == '動詞':
                bag.append(mrph.genkei)
        return bag
class IntentSlotDatasetReader(DatasetReader):
    def __init__(self, lazy=False, max_tokens=64):
        super().__init__(lazy)
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens
        self.jumanpp = Jumanpp()

    def _read(self, file_path):
        with open(file_path, 'r') as f:
            for line in f:
                line = line.strip().split()
                label = line[-1]
                line = [tt.split(':') for tt in line[:-2]]
                text = [Token(tt[0]) for tt in line][0:self.max_tokens]
                tags = [tt[1] for tt in line][0:self.max_tokens]
                yield self.text_to_instance(text, label, tags)

    def tokenizer(self, text):
        text = [
            Token(mrph.midasi)
            for mrph in self.jumanpp.analysis(text).mrph_list()
        ][0:self.max_tokens]
        return text

    def text_to_instance(self, text, label=None, tags=None):
        text_field = TextField(text, self.token_indexers)
        fields = {'text': text_field}
        if label:
            label_field = LabelField(label, label_namespace='labels')
            fields['label'] = label_field
        if tags:
            tags_field = SequenceLabelField(tags, text_field,
                                            label_namespace='tags')
            fields['tag'] = tags_field
        return Instance(fields)
import os
import sys
import json
import pickle

import numpy as np
import gensim as gs
import pandas as pd

from dict import Vocabulary
from pyknp import Jumanpp

jumanpp = Jumanpp()


def load_embeddings(vocabulary):
    word_embeddings = {}
    for word in vocabulary:
        word_embeddings[word] = np.random.uniform(-0.25, 0.25, 300)
    return word_embeddings


def pad_data(data, size, pad_index):
    new_data = []
    for data_ in data:
        if len(data_) >= size:
            data_ = data_[:size]
        else:
            while len(data_) < size:
                data_.append(pad_index)
        new_data.append(data_)
    return new_data


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
# -*- coding: utf-8 -*-
from pyknp import Jumanpp

# Use Juman++ in subprocess mode
jumanpp = Jumanpp()
result = jumanpp.analysis("ケーキを食べる")
for mrph in result.mrph_list():
    print("surface: {0}".format(mrph.midasi))
with open(args[1], "r") as f:
    textslist = [s.split("\t")[1].strip() for s in f if len(s.split("\t")) > 1]
with open(args[2], "r") as f:
    wordslist = [s.split("\t")[1].strip() for s in f if len(s.split("\t")) > 1]

tlist = [w.replace(" ", "_") for w in textslist if w != "" and not isAscii(w)]
wlist = [w.replace(" ", "_") for w in wordslist if w != ""]

t_midasi = []
w_midasi = []
jumanpp = Jumanpp()
"""
for i, s in enumerate(tlist):
    print("Processing Text:{}".format(i))
    if s == "":
        continue
    result = jumanpp.analysis(s)
    midasi_lst = []
    for w in result.mrph_list():
        midasi_lst.append([w.midasi.replace("_", " "), "O"])
    t_midasi.append(midasi_lst)
"""
print("-----------------")
for i, s in enumerate(wlist):
    print("Processing Word:{}".format(i))
from pyknp import Jumanpp

parser = argparse.ArgumentParser()
parser.add_argument("--input_text", help="classify text", type=str,
                    default="日本でのビジネス")
parser.add_argument("--path_to_model", help="model to use", type=str,
                    default="./models/my-model.ckpt")
args = parser.parse_args()

jumanpp = Jumanpp()
classify_data = []
vocab = Vocabulary("data_use.txt")
result = jumanpp.analysis(args.input_text)
for mrph in result.mrph_list():
    word = mrph.midasi
    classify_data.append(vocab.stoi(word))
classify_data = data_helper.pad_one(classify_data, 256, 0)

with open("training_config.json") as f:
    params = json.load(f)
embedding_mat = np.load("./models/embedding.npy")
def split_into_words(text):
    """Convert an article into a list of words."""
    result = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in result.mrph_list()]
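# Usage sketch, assuming Juman++ is installed; output is illustrative.
if __name__ == '__main__':
    print(split_into_words('ケーキを食べた'))  # e.g. ['ケーキ', 'を', '食べた']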