Code example #1
 def test_create_hash_func(self):
     rma = RakutenMA()
     hash_func = rma.create_hash_func(4)
     assert_equals(hash_func("feat1", "foo"), ["5"])
     assert_equals(hash_func("feat1", "bar"), ["2"])
     assert_equals(hash_func("feat1", "baz"), ["10"])
     assert_equals(hash_func("feat1", "qux"), ["3"])
Code example #2
File: FeatureUnion.py  Project: squidnee/lingo-bean
 def tagWordsInSentences(self, studying, entry):
     '''Tags the part of speech for each word.'''
     jar_path = 'stanford-postagger-full/stanford-postagger.jar'
     if studying in self.english:
         words = parseWordsFromEntry(entry)
         tagged_words = tagWords(words)
         return tagged_words
     elif studying in self.japanese or studying in self.korean or studying in self.mandarin:
         #segmenter = TinySegmenter()
         #words = segmenter.tokenize(entry)
         rm = RakutenMA()
         tagged_words = rm.tokenize(entry)
         #mecab = Mecab()
         #tagged_words = mecab.pos(entry)
         return tagged_words
     else:
         if studying in self.spanish:
             model_path = 'stanford-postagger-full/models/spanish.tagger'
             words = parseWordsFromEntry(entry)
         elif studying in self.french:
             model_path = 'stanford-postagger-full/models/french.tagger'
             words = parseWordsFromEntry(entry)
         postagger = StanfordPOSTagger(model_path,
                                       jar_path,
                                       encoding='utf8')
         tagged_words = postagger.tag(words)
         return tagged_words
Code example #3
class ExplainJapaneseSentences(BaseFilter):
    def __init__(self):
        super().__init__()

        # Initialize a RakutenMA instance with an empty model
        # the default Japanese feature set is already set
        self.rma = RakutenMA()

        # Initialize a RakutenMA instance with a pre-trained model
        self.rma = RakutenMA(
            phi=1024, c=0.007812
        )  # Specify hyperparameter for SCW (for demonstration purpose)

        # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
        self.rma.load(abspath(r'..\resource\model_ja.min.json'))

    def __call__(self, chunk):
        chunk = self._duplicate_chunk(chunk)
        chunk.final = True

        result = [chunk]
        text = self.tokenize(chunk.text)
        result.append(
            TextChunk(text=text,
                      language='japanese',
                      audible=False,
                      printable=True,
                      final=True))
        return result

    def tokenize(self, text):
        tokens = self.rma.tokenize(text)
        return ' '.join(map(lambda pair: f'{pair[0]} ({pair[1]})', tokens))
Code example #4
    def test_train_one(self):
        rma = RakutenMA()
        rma.featset = ["w0"]

        res = rma.train_one([["foo", "N-nc"], ["bar", "N-nc"]])
        assert_true(res["updated"])
        assert_true(Trie.find(rma.model["mu"], ["w0", "f", "B-N"]) > 0)
        assert_true(Trie.find(rma.model["mu"], ["w0", "o", "I-N"]) > 0)
        assert_true(Trie.find(rma.model["mu"], ["w0", "o", "E-N"]) > 0)
        assert_equals(rma.tokenize("foobar"), [["foo", "N-nc"], ["bar", "N-nc"]])
Code example #5
    def test_count_tps(self):
        # the last "a" doesn"t match because of offset of "d+"
        sent1 = ["a", "b", "c", "d", "a"]
        sent2 = ["a", "b", "c", "d+", "a"]
        assert_equals(RakutenMA.count_tps(sent1, sent2), 3)

        # ignores pos tags for comparison
        sent1 = [["x", "pos1"], ["y", "pos2"], ["z", "pos3"]]
        sent2 = [["x", "pos0"], ["u", "pos2"], ["v", "pos3"], ["x", "pos1"]]
        assert_equals(RakutenMA.count_tps(sent1, sent2), 1)
Code example #6
 def test_ctype_ja_default_func(self):
     rma = RakutenMA()
     assert_equals(rma.ctype_ja_default_func("あ"), "H")
     assert_equals(rma.ctype_ja_default_func("ア"), "K")
     assert_equals(rma.ctype_ja_default_func("A"), "A")
     assert_equals(rma.ctype_ja_default_func("a"), "a")
     assert_equals(rma.ctype_ja_default_func("漢"), "C")
     assert_equals(rma.ctype_ja_default_func("百"), "S")
     assert_equals(rma.ctype_ja_default_func("0"), "N")
     assert_equals(rma.ctype_ja_default_func("・"), "n")
Code example #7
    def test_decode(self):
        rma = RakutenMA()
        rma.hash_func = None
        csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
        csent = rma.add_efeats(csent)
        for i in range(len(csent)):
            csent[i].l = ""

        rma.model["mu"] = WEIGHTS
        csent = rma.decode(csent)
        assert_equals(csent[0].l, "_")
        assert_equals(csent[1].l, "B-N")
        assert_equals(csent[2].l, "I-N")
        assert_equals(csent[3].l, "E-N")
        assert_equals(csent[4].l, "B-N")
        assert_equals(csent[5].l, "I-N")
        assert_equals(csent[6].l, "E-N")
        assert_equals(csent[7].l, "_")

        csent = rma.tokens2csent([["foX", "N"], ["bar", "N"]], "SBIEO")
        csent = rma.add_efeats(csent)
        csent = rma.decode(csent)
        assert_equals(csent[0].l, "_")
        assert_equals(csent[1].l, "B-N")
        assert_equals(csent[2].l, "I-N")
        assert_equals(csent[3].l, "O")
        assert_equals(csent[4].l, "B-N")
        assert_equals(csent[5].l, "I-N")
        assert_equals(csent[6].l, "E-N")
        assert_equals(csent[7].l, "_")
Code example #8
    def test_csent2tokens(self):
        sent = [["hoge", "X"], ["fuga", "Y"], ["p", "Z"]]
        rma = RakutenMA()
        csent = rma.tokens2csent(sent, "SBIEO")
        sent = RakutenMA.csent2tokens(csent, "SBIEO")

        assert_equals(sent[0][0], "hoge")
        assert_equals(sent[0][1], "X")
        assert_equals(sent[1][0], "fuga")
        assert_equals(sent[1][1], "Y")
        assert_equals(sent[2][0], "p")
        assert_equals(sent[2][1], "Z")

        assert_raises(Exception, RakutenMA.csent2tokens, (csent, "UNKNOWN_SCHEME"))
Code example #9
    def __init__(self):
        super().__init__()

        # Initialize a RakutenMA instance with an empty model
        # the default Japanese feature set is already set
        self.rma = RakutenMA()

        # Initialize a RakutenMA instance with a pre-trained model
        self.rma = RakutenMA(
            phi=1024, c=0.007812
        )  # Specify hyperparameter for SCW (for demonstration purpose)

        # https://github.com/ikegami-yukino/rakutenma-python/tree/master/rakutenma/model
        self.rma.load(abspath(r'..\resource\model_ja.min.json'))
Code example #10
File: scorer.py  Project: saitamandd/japanese_corpus
    def __init__(self, kv_filepath, model):
        self.rma = RakutenMA(json.loads(open(model).read()))
        self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)


        self.ja_to_en = defaultdict(list)
        self.en_to_ja = defaultdict(list)

        for l in open(kv_filepath):
            [k, v] = l.strip().split(',')[:2]
            raw = unicode(k, 'utf-8')
#            lemma = self.rma.tokenize(raw)[0][0]
            self.ja_to_en[raw].append(v)
            self.en_to_ja[v].append(raw)
Code example #11
    def test_tokens2csent(self):
        sent = [["hoge", "X"], ["fuga", "Y"], ["p", "Z"]]

        rma = RakutenMA()
        assert_raises(Exception, rma.tokens2csent, (sent, "UNKNOWN_SCHEME"))

        csent = rma.tokens2csent(sent, "SBIEO")
        assert_equals(csent[1].c, "h")
        assert_equals(csent[1].l, "B-X")
        assert_equals(csent[2].c, "o")
        assert_equals(csent[2].l, "I-X")
        assert_equals(csent[4].c, "e")
        assert_equals(csent[4].l, "E-X")
        assert_equals(csent[9].c, "p")
        assert_equals(csent[9].l, "S-Z")
Code example #12
    def test_str2csent(self):
        rma = RakutenMA()
        actual = rma.str2csent("hoge")
        desired = [
            Token(l=_BEOS_LABEL),
            Token(c="h", t=rma.ctype_ja_default_func("h")),
            Token(c="o", t=rma.ctype_ja_default_func("o")),
            Token(c="g", t=rma.ctype_ja_default_func("g")),
            Token(c="e", t=rma.ctype_ja_default_func("e")),
            Token(l=_BEOS_LABEL)]

        assert_equals(len(actual), len(desired))
        for i in range(len(actual)):
            assert_equals(actual[i].c, desired[i].c)
            assert_equals(actual[i].t, desired[i].t)
            assert_equals(actual[i].f, desired[i].f)
            assert_equals(actual[i].l, desired[i].l)
Code example #13
def tokenize(x, t):
    if t in TOKC:
        from jieba import posseg
        toks = posseg.cut(x)
        if t == POSC:
            return u'\u3000'.join([('%s [%s]' % (f.word, f.flag))
                                   for f in toks])
        elif t == SPACEC:
            return u'\u3000'.join([('%s' % (f.word)) for f in toks])
        else:
            return lexDens(toks, t)
    elif t in TOKJ:
        from rakutenma import RakutenMA
        rma = RakutenMA()
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')
        rma.load(jSon)
        toks = rma.tokenize(x)
        if t == SPACEJ:
            return u'\u3000'.join([i[0] for i in toks])
        elif t == POSJ:
            return u'\u3000'.join([('%s [%s]' % (i[0], i[1])) for i in toks])
        else:
            return lexDens(toks, t)
Code example #14
    def test_eval_corpus(self):
        sent1 = ["a", "b", "c", "d", "a"]
        sent2 = ["a", "b", "c", "d+", "a", "b", "c", "d", "e", "f"]
        res = RakutenMA.eval_corpus([sent1], [sent2])
        assert_equals(res[0], 0.3)
        assert_equals(res[1], 0.6)
        assert_equals(res[2], 0.4)

        assert_raises(Exception, RakutenMA.eval_corpus, (["a"], []))
Code example #15
def splitShuffle(expr, t):
    expr = stripHTML(expr).strip()
    if t == SGJ:       
        from rakutenma import RakutenMA    
        rma = RakutenMA(phi=1024, c=0.007812)
        tD, tF = os.path.split(__file__)
        jSon = os.path.join(tD, 'model_ja.min.json')     
        rma.load(jSon)           
        resultl = rma.tokenize(expr)  
        result = [r for r, s in resultl]
    elif t in SGC:    
        import jieba
        result = jieba.cut(expr, cut_all=False)  
    elif t == JSS:    
        result = expr.split(' ')
    elif t in WRAPL:
        result = list(expr)
    newResult, glosses = getResult(result, t)
    jn = u''
    full = jn.join(newResult)
    random.shuffle(newResult)     
    strResult = u''.join(newResult)     
    return strResult, full, glosses
Code example #16
def _get_tokenizer(lang):
    rma = None
    if lang == 'ja':
        rma = RakutenMA()
        rma.load('model_ja.json')
        rma.hash_func = rma.create_hash_func(15)
        tokenizer = _jap_tokenizer
        # tokenizer = _jap_character_tokenizer
    else:
        tokenizer = _eng_tokenizer
    return tokenizer, rma
Code example #17
File: scorer.py  Project: saitamandd/japanese_corpus
class PairScorer():


    def __init__(self, model):
        print model
        self.rma = RakutenMA(json.loads(open(model).read()))
        self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)
        return


    def extract_ja_content_lemmas(self, s):
        """ extracxts content words from a japanese sentence
            (nouns, verb roots, adjectives, no okurigana)
        """
        s = unicode(s, 'utf-8')

        out = []
        for [x, y] in self.rma.tokenize(s):
            if y in RAKUTEN_POS_TAGS:
                if y.startswith('V'):                
                    out += [(guess, y) for guess in guess_stem(x)]
                else:
                    out.append( (x, y) )
        return out

    def extract_en_content_lemmas(self, s):
        def penn_to_wordnet(pos):
            p = pos[0].lower()
            if    p == 'j': return 'a'
            elif  p == 'r': return 'r'
            elif  p == 'v': return 'v'
            else:             return 'n'

        lemmatizer = nltk.stem.WordNetLemmatizer()
        s = unicode(s, 'utf-8')
        
        out = []
        for w, pos in nltk.pos_tag(nltk.word_tokenize(s)):
            if pos in PENN_POS_TAGS:
                out.append( (lemmatizer.lemmatize(w, pos=penn_to_wordnet(pos)), pos) )
        return out
Code example #18
 def test_csent2feats(self):
     rma = RakutenMA()
     rma.hash_func = None
     rma.featset = ["w0"]
     csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
     csent = rma.add_efeats(csent)
     feats = rma.csent2feats(csent)
     desired = (
         ["w0", "", "_"], ["w0", "f", "B-N"], ["w0", "o", "I-N"],
         ["w0", "o", "E-N"], ["w0", "b", "B-N"], ["w0", "a", "I-N"],
         ["w0", "r", "E-N"], ["t", "B-N", "_"], ["t", "I-N", "B-N"],
         ["t", "E-N", "I-N"], ["t", "B-N", "E-N"], ["t", "_", "E-N"])
     for d in desired:
         assert_true(d in feats)
     assert_true(["t", "E-N", "B-N"] not in feats)
     assert_true(["t", "B-N", "I-N"] not in feats)
Code example #19
# coding=utf8
from rakutenma import RakutenMA
import tinysegmenter
from nltk import *
import nltk
import re

#segmenter = tinysegmenter.TinySegmenter()
result = tinysegmenter.tokenize(
    "米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。米連邦議会の多数党がどちらになるかによって、ドナルド・トランプ米大統領の政策の行方が決まる。特に下院でどれだけ、民主党が共和党現職の議席を奪うかが注目されている。")
print('Segmenter: ')
print(result)

# Initialize a RakutenMA instance with an empty model
# the default Japanese feature set is already set
rma = RakutenMA()

# Let's analyze a sample sentence (from http://tatoeba.org/jpn/sentences/show/103809)
# With a disastrous result, since the model is empty!
print('Result')
print(rma.tokenize(result))
print('Original')
print(rma.tokenize("米中間選挙は6日に午後6時(日本時間7日午前8時)に一部の投票所が締め切られ、開票が始まった。"))
print('------------------')
# print(rma.tokenize("子どものみなさん、ゆるしてください。ぼくはこの本をひとりのおとなのひとにささげます。でもちゃんとしたわけがあるのです。"))
# print(rma.tokenizetwo("彼は新しい仕事できっと成功するだろう。"))
# print(rma.tokenize("彼は新しい仕事できっと成功するだろう。"))

# Feed the model with ten sample sentences from tatoeba.com
# "tatoeba.json" is available at https://github.com/rakuten-nlp/rakutenma
Code example #20
File: scorer.py  Project: saitamandd/japanese_corpus
 def __init__(self, model):
     print model
     self.rma = RakutenMA(json.loads(open(model).read()))
     self.rma.hash_func = RakutenMA.create_hash_func(self.rma, 15)
     return
Code example #21
 def __init__(self):
     rma = RakutenMA()
     rma.load("model_ja.json")
     rma.hash_func = rma.create_hash_func(15)
     self.rma = rma 
Code example #22
    del counter['DET']
    del counter['DETWH']
    counter['ET'] += 0
    counter['I'] += 0
    counter['NC'] += counter['N'] + counter['VINF']; del counter['N']; del counter['VINF']
    counter['NP'] += counter['NPP']; del counter['NPP']
    del counter['P']   # The Japanese tag set doesn't account for prepositions. We could manually look for them using a table like this: http://mylanguages.org/japanese_prepositions.php
    counter['PREF'] += 0
    counter['PRO'] += counter['PROREL'] + counter['PROWH']; del counter['PROREL']; del counter['PROWH']
    counter['V'] += counter['VIMP'] + counter['VPR'] + counter['VS']; del counter['VIMP']; del counter['VPR']; del counter['VS']
    counter['PUNC'] += counter['.$$.']; del counter['.$$.']

    return dict(counter)


rma = RakutenMA()
rma.load("model_ja.json")
def _analyze_ja(text):
    tags = rma.tokenize(text)
    counter = collections.Counter([x[1] for x in tags])  # see the same premise above in the French section
    subordinating_conjunctions = list(filter(lambda tup: tup[1] == 'C' and tup[0] in jsc, tags))

    return {  # we need to map the Japanese tagset to a subset of the French tagset, so that we can compare the two
        'ADJ':    counter['A-c'] + counter['A-dp'] + counter['J-c'] + counter['J-tari'] + counter['J-xs'] + counter['R'],
        'ADV':  counter['F'],
        'CC':   counter['C'] - len(subordinating_conjunctions),
        'CS':   len(subordinating_conjunctions),
        'ET':   counter['E'],
        'I':    counter['I-c'],
        'NC':   counter['N-n'] + counter['N-nc'],
        'NP':   counter['N-pn'],
Code example #23
 def test_tokens2string(self):
     sent = [["hoge", "X"], ["fuga", "Y"], ["p", "Z"]]
     assert_equals(RakutenMA.tokens2string(sent), "hoge [X] | fuga [Y] | p [Z]")
Code example #24
 def test_tokens_identical(self):
     assert_false(RakutenMA.tokens_identical([["a"]], [[]]))
     assert_false(RakutenMA.tokens_identical([["a"]], [["b"]]))
     assert_false(RakutenMA.tokens_identical([["a", "pos1"]], [["a", "pos2"]]))
     assert_true(RakutenMA.tokens_identical([["a", "pos1"]], [["a", "pos1"]]))
Code example #25
 def test_create_ctype_chardic_func(self):
     rma = RakutenMA()
     cfunc = rma.create_ctype_chardic_func({"a": ["type1"], "b": ["type2"]})
     assert_equals(cfunc("a"), ["type1"])
     assert_equals(cfunc("b"), ["type2"])
     assert_equals(cfunc("c"), [])
Code example #26
 def test_tokenize_corpus(self):
     test_corpus = [[["abra", "pos1"], ["cadabra", "pos2"]]]
     tokenize_func = lambda s: list(s)
     desired = [["a", "b", "r", "a", "c", "a", "d", "a", "b", "r", "a"]]
     assert_equals(RakutenMA.tokenize_corpus(tokenize_func, test_corpus), desired)
Code example #27
    def test_add_efeats(self):
        # feature functions test
        rma = RakutenMA()
        rma.hash_func = None
        rma.featset = ["w0"]
        csent = rma.str2csent("A1-b")
        csent = rma.add_efeats(csent)
        assert_equals(csent[0].f, [["w0", ""]])
        assert_equals(csent[1].f, [["w0", "A"]])
        assert_equals(csent[2].f, [["w0", "1"]])
        assert_equals(csent[3].f, [["w0", "-"]])
        assert_equals(csent[4].f, [["w0", "b"]])
        assert_equals(csent[5].f, [["w0", ""]])

        rma.featset = ["b1"]
        csent = rma.add_efeats(csent)
        assert_equals(csent[0].f, [["b1", "", "A"]])
        assert_equals(csent[1].f, [["b1", "A", "1"]])
        assert_equals(csent[2].f, [["b1", "1", "-"]])
        assert_equals(csent[3].f, [["b1", "-", "b"]])
        assert_equals(csent[4].f, [["b1", "b", ""]])
        assert_equals(csent[5].f, [["b1", "", ""]])

        rma.featset = ["c0"]
        csent = rma.add_efeats(csent)
        assert_equals(csent[0].f, [["c0", ""]])
        assert_equals(csent[1].f, [["c0", "A"]])
        assert_equals(csent[2].f, [["c0", "N"]])
        assert_equals(csent[3].f, [["c0", "O"]])
        assert_equals(csent[4].f, [["c0", "a"]])
        assert_equals(csent[5].f, [["c0", ""]])

        rma.featset = ["d9"]
        csent = rma.add_efeats(csent)
        assert_equals(csent[0].f, [["d9", "", ""]])
        assert_equals(csent[1].f, [["d9", "", "A"]])
        assert_equals(csent[2].f, [["d9", "A", "N"]])
        assert_equals(csent[3].f, [["d9", "N", "O"]])
        assert_equals(csent[4].f, [["d9", "O", "a"]])
        assert_equals(csent[5].f, [["d9", "a", ""]])

        rma.featset = ["t0"]
        csent = rma.add_efeats(csent)
        assert_equals(csent[0].f, [["t0", "", "", "A"]])
        assert_equals(csent[1].f, [["t0", "", "A", "1"]])
        assert_equals(csent[2].f, [["t0", "A", "1", "-"]])
        assert_equals(csent[3].f, [["t0", "1", "-", "b"]])
        assert_equals(csent[4].f, [["t0", "-", "b", ""]])
        assert_equals(csent[5].f, [["t0", "b", "", ""]])

        # test a custom function for feature
        # args _t: a function which receives position i and returns the token,
        #          taking care of boundary cases
        #       i: current position
        # sample function -> returns if the character is a capitalized letter
        rma.featset = [lambda _t, i: ["CAP", "T" if _t(i).t == "A" else "F"]]
        csent = rma.add_efeats(csent)
        assert_equals(csent[0].f, [["CAP", "F"]])
        assert_equals(csent[1].f, [["CAP", "T"]])
        assert_equals(csent[2].f, [["CAP", "F"]])
        assert_equals(csent[3].f, [["CAP", "F"]])
        assert_equals(csent[4].f, [["CAP", "F"]])
        assert_equals(csent[5].f, [["CAP", "F"]])

        rma.featset = ["NONEXISTENT_FEATURE"]
        assert_raises(Exception, rma.add_efeats, csent)
Code example #28
 def test_set_model(self):
     rma = RakutenMA()
     rma.set_model({"mu": {"feat1": 0.3}, "sigma": {"feat1": 0.4}})
     assert_equals(rma.scw.mu, {"feat1": 0.3})
     assert_equals(rma.scw.sigma, {"feat1": 0.4})
Code example #29
 def test_set_tag_scheme(self):
     rma = RakutenMA()
     rma.set_tag_scheme("IOB2")
     assert_equals(rma.tag_scheme, "IOB2")
Code example #30
import os
import json
import pickle
from pathlib import Path
from transformers import BertTokenizer
from BertDataset import BertDataset
from tqdm import tqdm
import pandas as pd
import numpy as np
import unicodedata
import re
from rakutenma import RakutenMA

import sys

rma = RakutenMA()  # (default: phi = 2048, c = 0.003906)
tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese',
                                          do_lower_case=False)


def main(args):

    with open(args.config_path / 'config.json') as f:
        config = json.load(f)

    rma.load(args.config_path / "model_ja.min.json")
    rma.hash_func = rma.create_hash_func(15)

    print(f"config:{config}")

    # loading datasets from excel files
Code example #31
 def test_string2hash(self):
     rma = RakutenMA()
     assert_equals(rma.string2hash("hoge"), 3208229)
     assert_equals(rma.string2hash("piyopiyo"), -105052642)
     assert_equals(rma.string2hash(""), 0)
Code example #32
File: MA_server.py  Project: sm004/japanese-scripts
def create_tokenizer():
    rma = RakutenMA()
    rma.load('model_ja.json')
    rma.hash_func = rma.create_hash_func(15)
    return rma.tokenize
Code example #33
    def test_calc_states0(self):
        rma = RakutenMA()
        rma.hash_func = None
        rma.featset = ["c0", "w0"]
        csent = rma.tokens2csent([["foo", "N"], ["bar", "N"]], "SBIEO")
        csent = rma.add_efeats(csent)

        assert_equals(rma.calc_states0(csent[1].f, WEIGHTS),
                      {"B-N": 2, "I-N": 1, "E-N": 1})
        assert_equals(rma.calc_states0(csent[2].f, WEIGHTS),
                      {"B-N": 1, "I-N": 2, "E-N": 2})
        assert_equals(rma.calc_states0(csent[3].f, WEIGHTS),
                      {"B-N": 1, "I-N": 2, "E-N": 2})
        assert_equals(rma.calc_states0(csent[4].f, WEIGHTS),
                      {"B-N": 2, "I-N": 1, "E-N": 1})
        assert_equals(rma.calc_states0(csent[5].f, WEIGHTS),
                      {"B-N": 1, "I-N": 2, "E-N": 1})
        assert_equals(rma.calc_states0(csent[6].f, WEIGHTS),
                      {"B-N": 1, "I-N": 1, "E-N": 2})