Example #1
 def get_mecab(self):
     try:
         return mecab.MeCab()
     except Exception as exc:
         raise Exception(
             'Please install python-mecab-ko with: `pip install python-mecab-ko`'
         ) from exc
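For reference, a minimal sketch (not taken from any one example) of the python-mecab-ko calls these snippets rely on; morphs, nouns, and pos all appear in the examples below, and the sample sentence is only illustrative:

import mecab

tagger = mecab.MeCab()

sentence = "시동 걸어"           # "start the engine" (illustrative input)
print(tagger.morphs(sentence))  # surface morphemes, e.g. ['시동', '걸', '어']
print(tagger.nouns(sentence))   # nouns only, e.g. ['시동']
print(tagger.pos(sentence))     # (morpheme, tag) pairs, e.g. [('시동', 'NNG'), ('걸', 'VV'), ('어', 'EC')]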
Example #2
def train():

    # tokenizer
    meCab = mecab.MeCab()

    train_X, train_y, test_X, test_y = make_data('corpus', testprob=0.1)
    #print(len(train_X), len(train_y))

    print('--- Get vocabulary')
    with open('vocab.pickle', 'rb') as f:
        vocab = pickle.load(f)
    print('--- Loaded vocabulary successfully')
    print('%% Vocabulary size:', len(vocab))

    count_vect = CountVectorizer(tokenizer=meCab.morphs,
                                 ngram_range=(1, 3),
                                 max_features=10000,
                                 vocabulary=vocab)

    X_train_counts = count_vect.transform(train_X)
    print("The number of features: {}".format(X_train_counts.shape[1]))

    tfidf_transformer = TfidfTransformer(use_idf=False,
                                         smooth_idf=False,
                                         norm='l2')
    # fit_transform instead of bare transform keeps sklearn's fitted-state check satisfied
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    #print(X_train_tfidf.shape)

    # Naive Bayes
    # clf = MultinomialNB().fit(X_train_tfidf, train_y)

    # SVM
    clf_svm = SGDClassifier().fit(X_train_tfidf, train_y)

    # Evaluation
    X_test_counts = count_vect.transform(test_X)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    print()
    predicted = clf_svm.predict(X_test_tfidf)
    print("SVM: ", np.mean(predicted == test_y))

    print()
    print("Examples: ")
    print("Input\t   Predicted\t  Correct")
    for i in range(3):
        print("%s\t=> %s\t: %s" % (test_X[i], predicted[i], test_y[i]))

    # model save
    # first, delete the old model if it exists
    if os.path.exists('model/hmc.model'):
        os.remove('model/hmc.model')
    print()
    with open('model/hmc.model', 'wb') as f:
        pickle.dump(clf_svm, f)

    #pickle.dump(count_vect, open('model/count.pickle', 'wb'))
    #pickle.dump(X_train_tfidf, open('model/train_feature.pickle', 'wb'))
    #pickle.dump(X_test_tfidf, open('model/test_feature.pickle', 'wb'))
    print('SVM classifier model saved at "model/hmc.model"')
    print('If you want to load the model, use "pickle.load" in python.')
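As the message above suggests, reloading the saved classifier is a plain pickle load; a minimal sketch, assuming the same 'model/hmc.model' path:

import pickle

# Reload the SGDClassifier saved by train()
with open('model/hmc.model', 'rb') as f:
    clf = pickle.load(f)
# clf.predict(...) now works on TF-IDF features built with the same vocabulary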
Example #3
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            # escape the joined tokens so regex metacharacters are matched literally
            words = re.findall(re.escape(' '.join(tokens)), sentences)
            if len(words) < (cnt * 0.05):
                # If the morpheme-split form occurs less than 5% as often as the
                # unsplit word, treat it as a morpheme segmentation error.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                jongsung = 'T' if 'ㄱ' <= jong <= 'ㅎ' else 'F'
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(
                    word, jongsung, word)
                # print("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}\n".format(word, ' '.join(tokens),
                                                   cnt, len(words)))
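The dic_line built above follows the mecab-ko-dic user-dictionary CSV layout, in which the seventh field records whether the word ends in a final consonant (jongsung, T/F). A minimal sketch of that flag logic with hgtk, using an illustrative word:

import hgtk

word = "트와이스"  # illustrative dictionary entry
cho, jung, jong = hgtk.letter.decompose(word[-1])     # '스' -> ('ㅅ', 'ㅡ', '')
jongsung_flag = 'T' if 'ㄱ' <= jong <= 'ㅎ' else 'F'  # empty jong -> 'F'
print("{},,,,NNP,*,{},{},*,*,*,*,*".format(word, jongsung_flag, word))
# -> 트와이스,,,,NNP,*,F,트와이스,*,*,*,*,*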
Example #4
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "brainbert" in self.config.n_model:
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            from pororo.models.brainbert import BrainRobertaModel
            from pororo.utils import postprocess_span

            model = (BrainRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))

            tagger = mecab.MeCab()

            return PororoBertMrc(model, tagger, postprocess_span, self.config)
Example #5
    def test_modules(self):
        try:
            import mecab
            tagger = mecab.MeCab()

            res = postprocess_span(tagger, "이민자들은")
            self.assertIsInstance(res, str)
            self.assertEqual(res, "이민자들")

            res = postprocess_span(tagger, "8100억원에")
            self.assertIsInstance(res, str)
            self.assertEqual(res, "8100억원")

            res = postprocess_span(tagger, "1960년대부터")
            self.assertIsInstance(res, str)
            self.assertEqual(res, "1960년대")

            res = postprocess_span(tagger, "군사 목적으로는,")
            self.assertIsInstance(res, str)
            self.assertEqual(res, "군사 목적")

        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
Example #6
 def __init__(self, **kwargs):
     self.system = kwargs['system']
     self.datetimeParser = self.getParser()
     self.mecab = mecab.MeCab()
     self.topKw = kwargs.get('topKw', 30)
Example #7
def mecab_tokenize(corpus_fname, output_fname):
    mcab = mecab.MeCab()

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.strip()
            tokens = mcab.morphs(sentence)
            tokenized_sent = ' '.join(tokens)
            f2.write(tokenized_sent + '\n')
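Usage is a single call, assuming an input corpus with one sentence per line (file names here are only illustrative):

mecab_tokenize('corpus.txt', 'corpus_tokenized.txt')
# each output line holds the sentence's morphemes joined by spaces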
Example #8
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {
        key: scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)
        for key in scores
    }
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            # escape the joined tokens so regex metacharacters are matched literally
            words = re.findall(re.escape(' '.join(tokens)), sentences)
            if len(words) < (cnt * 0.05):
                # If the morpheme-split form occurs less than 5% as often as the
                # unsplit word, treat it as a morpheme segmentation error.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                jongsung = 'T' if 'ㄱ' <= jong <= 'ㅎ' else 'F'
                dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, jongsung, word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}\t{}\n".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)))
Example #9
def decode(sentence):
    # tokenizer
    meCab = mecab.MeCab()

    print('--- Get vocabulary')
    try:
        with open('vocab.pickle', 'rb') as f:
            vocab = pickle.load(f)
    except FileNotFoundError:
        print('Loading vocabulary failed: vocab.pickle not found.')
        return None

    print('--- Loaded vocabulary successfully: vocab.pickle')
    print('%% Vocabulary size:', len(vocab))
    print(vocab)
    print()

    try:
        with open('model/hmc.model', 'rb') as f:
            model = pickle.load(f)
            print("--- Loaded model successfully: model/hmc.model")
    except FileNotFoundError:
        print("Loading model failed: model/hmc.model not found.")
        return None

    print(model)
    print()

    #count_vect = pickle.load(open('model/count.pickle', 'rb'))

    count_vect = CountVectorizer(tokenizer=meCab.morphs,
                                 ngram_range=(1, 3),
                                 max_features=10000,
                                 vocabulary=vocab)

    # Note: this TfidfVectorizer is built but never used below; the
    # CountVectorizer + TfidfTransformer pair does the actual vectorizing.
    tfidf_vect = TfidfVectorizer(tokenizer=meCab.morphs,
                                 ngram_range=(1, 3),
                                 max_features=10000,
                                 vocabulary=vocab)

    # vectorize
    sent_counts = count_vect.transform([sentence])

    tfidf_transformer = TfidfTransformer(use_idf=False,
                                         smooth_idf=False,
                                         norm='l2')
    sent_tfidf = tfidf_transformer.fit_transform(sent_counts)

    pred = model.predict(sent_tfidf)

    print('Input:', sentence)
    print('Prediction:', pred)

    return pred
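A usage sketch, assuming vocab.pickle and model/hmc.model exist as produced by the training code above (the input sentence is illustrative):

pred = decode("분류할 문장")  # "a sentence to classify"
if pred is not None:
    print(pred[0])           # predicted label for the single input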
Example #10
    def tokenizer(sent):
        # Note: parseToNode/surface/feature below follow the mecab-python3
        # style API; python-mecab-ko's MeCab exposes morphs/nouns/pos instead.
        meCab = mecab.MeCab()
        result = []

        tags = meCab.parseToNode(sent)
        while tags:
            #output = '%s/%s' % (tags.surface, tags.feature.split(',')[0])
            if tags.surface:
                result.append(tags.surface)

            tags = tags.next

        return result
Example #11
    def __init__(self, rawpath, isEng=False):
        self.isEng = isEng

        # inflect engine: number_to_string(int)
        self.p = inflect.engine()

        # mecab class
        self.meCab = mecab.MeCab()

        # Loading raw corpus file
        self.raw = dc.load_raw(rawpath)

        self.contents = self.load_contents('work/contents/')
Example #12
 def get_mecab(self):
     if os.name == 'nt':
         from konlpy.tag import Mecab
         mecab_ko_dic_path = os.environ.get('MECAB_KO_DIC_PATH',
                                            r"C:\mecab\mecab-ko-dic")
         return Mecab(dicpath=mecab_ko_dic_path)
     else:
         import mecab
         try:
             return mecab.MeCab()
         except Exception as exc:
             raise Exception(
                 'Please install python-mecab-ko with: `pip install python-mecab-ko`'
             ) from exc
Example #13
    def __init__(self, mode = 'serve', data_path='./dataset/train.tsv'):
        self.mode = mode
        self.data_path = data_path
        self.train_corpus = None
        if self.mode == 'train':
            dataset = pd.read_table(self.data_path)
            dataset = dataset[['titles', 'labels']]
            self.train_corpus = dataset['titles'].tolist()

        # python -m spacy download en
        # python -m spacy download en_core_web_sm
        # python -m spacy link en_core_web_sm en

        self.tokenizer = self.soynlp_tokenizer()
        self.nlp = spacy.load('en')         
        self.mecab = mecab.MeCab()
        self.okt = Okt()
Example #14
    def load(self, device: str):

        if "brainbert" in self.config.n_model:
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            # from pororo.models.brainbert import BrainRobertaModel
            from pororo.utils import postprocess_span

            model = My_BrainRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang).eval().to(device)

            tagger = mecab.MeCab()

            return PororoBertMrc(model, tagger, postprocess_span, self.config)
Example #15
 def _init_tagger(self, taggername):
     global usingkonlpy
     if not usingkonlpy and taggername != "Mecab":
         from konlpy.tag import Okt, Mecab, Komoran
         usingkonlpy = True
     if taggername == "Okt":
         self.tagger = konlpy.tag.Okt()
         self.tagsOkt = True
     elif taggername == "Mecab":
         if usingkonlpy:
             # Use Mecab(dicpath="c:/some/path/mecab-ko-dic") for a
             # non-default location. (?? mecab uses rcfile and dicdir not
             # dicpath)
             self.tagger = konlpy.tag.Mecab()
         else:
             self.tagger = mecab.MeCab()
         self.tagsMecab = True
     elif taggername == "Komoran":
         self.tagger = konlpy.tag.Komoran()
         self.tagsKomoran = True
     else:
         raise ValueError("Bad tagger name: " + taggername)
Example #16
    def initialize(self, ctx):
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else "cpu")
        logger.debug('Will load from {0}'.format(model_dir))
        # Read model serialize/pt file
        x = hub_utils.from_pretrained(model_dir,
                                      "model.pt",
                                      DATA_PATH,
                                      load_checkpoint_heads=True)
        model_interface = BrainRobertaHubInterface(
            x["args"],
            x["task"],
            x["models"][0],
            model_dir,
        ).to(self.device)

        tagger = mecab.MeCab()

        self.model = PororoBertMrc(
            model_interface, tagger, postprocess_span,
            TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"))

        # Read the mapping file, index to object name
        # mapping_file_path = os.path.join(model_dir, "index_to_name.json")

        # if os.path.isfile(mapping_file_path):
        #     with open(mapping_file_path) as f:
        #         self.mapping = json.load(f)
        # else:
        #     logger.warning('Missing the index_to_name.json file. Inference output will not include class name.')

        self.initialized = True
Example #17
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {
        key: scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)
        for key in scores
    }
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            jongsung = 'T' if 'ㄱ' <= jong <= 'ㅎ' else 'F'
            dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, jongsung, word)
            f2.write(dic_line + '\n')
            f3.write("{}\t{}\t{}\n".format(word, ' '.join(tokens), cnt))
Example #18
def make_vocab(corpus_path, save_path):
    meCab = mecab.MeCab()
    fl = glob.glob(corpus_path + "/*.txt")

    all_morphs = []
    all_tri = []
    all_bi = []
    for fn in fl:
        with open(fn, 'r', encoding='utf-8') as f:
            raw = f.readlines()
        for s in raw:
            morphs = meCab.morphs(s)
            for m in morphs:
                all_morphs.append(m)

            # note: s is a raw sentence string, so these are character n-grams
            for w1, w2, w3 in trigrams(s, pad_left=True, pad_right=True):
                all_tri.append((w1, w2, w3))

            for w1, w2 in bigrams(s, pad_left=True, pad_right=True):
                all_bi.append((w1, w2))

    #all_morphs = nltk.FreqDist(w for w in all_morphs)
    #uni = list(all_morphs)[:200]
    #all_tri = nltk.FreqDist(tri for tri in all_tri)
    #tri = list(all_tri)[:200]
    #all_bi = nltk.FreqDist(bi for bi in all_bi)
    #bi = list(all_bi)[:200]

    #vocab = uni + tri + bi

    vocab = list(set(all_morphs))
    with open(save_path, 'wb') as f:
        pickle.dump(vocab, f)

    print("--- Vacabulary saved in", save_path)
    print('%% The size of vocabulary:', len(vocab))
Example #19
    def __init__(self, src_lang, tgt_lang, phrase_maxlen):

        langs = {src_lang, tgt_lang}

        if "ja" in langs:
            # pip install mecab-python3 unidic-lite
            import MeCab
            self.ja_tagger = MeCab.Tagger()

        if "ko" in langs:
            # pip install python-mecab-ko
            import mecab
            self.ko_tagger = mecab.MeCab()

        alnum = "0-9A-Za-z\xC0-\xFF"  # Latin-1
        alnum += "\u0100-\u017F"  # Latin Extended-A
        alnum += "\u0180-\u024F"  # Latin Extended-B
        alnum += "\u1E00-\u1EFF"  # Latin Extended Additional
        alnum += "\uAC00-\uD7AF"  # Hangul Syllables

        self.RE_NAN_L = re.compile("(?<=[^ %s])(?=[%s])" % (alnum, alnum))
        self.RE_NAN_R = re.compile("(?<=[%s])(?=[^ %s])" % (alnum, alnum))

        self.phrase_maxlen = phrase_maxlen
Example #20
def PosWithSpace(sentence, mecab_ko=None, extend=False):
    if mecab_ko is None:
        mecab_ko = mecab.MeCab()

    # Re-insert the spaces into the POS-tagged output (tagged as 'SPACE')
    pos = mecab_ko.pos(sentence)
    sentence_sliced = sentence
    pos_with_space = []
    for p in pos:
        while sentence_sliced and sentence_sliced[0] == ' ':
            sentence_sliced = sentence_sliced[1:]
            pos_with_space.append((' ', 'SPACE'))
        sentence_sliced = sentence_sliced[len(p[0]):]
        pos_with_space.append(p)

    if extend:
        pos_extended = []
        for p in pos_with_space:
            pos_extended += [('', pp) for pp in p[1].split('+')]
            pos_extended[-1] = (p[0], pos_extended[-1][1])
        return pos_extended
    else:
        return pos_with_space
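A usage sketch; the expected output (shown as a comment) follows the tag sequence illustrated in Example #22 below:

tagger = mecab.MeCab()
print(PosWithSpace("시동 걸어", mecab_ko=tagger))
# -> [('시동', 'NNG'), (' ', 'SPACE'), ('걸', 'VV'), ('어', 'EC')]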
Example #21
#            for i in range(0,len(json_data["sentence"][j]["dependency"])):
#                #json_lemma = json_data["sentence"][j]["morp"][i]["lemma"]
#                json_lemma = json_data["sentence"][j]["dependency"][i]["text"]
#                #json_type = json_data["sentence"][j]["morp"][i]["type"]
#                json_type = json_data["sentence"][j]["dependency"][i]["label"]
#                print(json_lemma + "\t" + json_type)
#            print("\n")
###############################################################################################################

####################################### Exobrain data named-entity extraction code ###########################################
import os
import sys
import mecab

os.chdir(os.getcwd())  # no-op; kept from the original
mecab = mecab.MeCab()  # note: shadows the mecab module with a MeCab instance

sys.stdout = open("NEW_EXOBRAIN_NE_CORPUS_10000.txt", "w", -1, "utf-8")
f = open("EXOBRAIN_NE_CORPUS_10000.txt", "rt", -1, "utf-8")

while True:
    line = f.readline()
    if not line:
        break

    tokened_str = mecab.morphs(line)

    check = 0
    lemma = []
    type = []
Example #22
    def __init__(self, req):
        ###################################
        # Request form
        # 'utt': User input utterance
        # 'code': Status Code
        #   API Server Code
        #   '7000': first input
        #   '7001': No necessary slots (Control_Engine_Start only)
        #   '8000': After PIN input, previous command not yet processed
        #
        #   Bluelink Server Code
        #   '200': Vehicle control succeeded
        #   '4002': Invalid request body
        #   '4003': Invalid PIN
        #   '4004': Duplicate request
        #   '4005': Unsupported control request
        #   '4011': Invalid access token
        #   '4081': Request timeout
        #   '5001': Internal server error
        #   '5031': Service temporarily unavailable
        #   '5041': Gateway timeout
        # 'user_key': Identifier for users
        ####################################

        # Load the Mecab morphological analyzer
        self.mecab = mecab.MeCab()

        # Requests from the API server arrive as JSON;
        # parse them into a dictionary with the json module.
        if type(req) == str:
            req_body = json.loads(req)
        else:
            req_body = req

        # Store the request fields in individual variables for convenience.
        # utt: user input utterance
        # code: status code describing the current state (see comments above)
        # user_key: encrypted id distinguishing the user who spoke
        self.utt = req_body['utt']
        self.code = req_body['code']
        self.user_key = req_body['user_key']

        # When the dialogue gets deeper, some information must be preserved
        # (e.g. for engine start, the temperature must be kept until the PIN is entered).
        # That information is stored in req['options'].
        # options list:
        # 1. cabin temperature ('temp')
        self.options = req_body['options']

        # Morphological analysis is needed to check the LM rules,
        # so the user's input utterance is analyzed here.
        # The analyzed utterance takes the following form:
        # (example)
        # input utterance: 시동 걸어 ("start the engine")
        # pos: [
        #   ('시동', 'NNG'),
        #   ('걸', 'VV'),
        #   ('어', 'EC'),
        # ]
        self.pos = self.mecab.pos(self.utt)

        # Load the file containing the LM rules and response forms
        # (could later be replaced by a DB).
        # The loaded data is stored as a dictionary;
        # see make_rule.py for the exact format.

        # replaced with a DB
        self.dm = self.get_DM_from_DB()
        
        # Find the intention in the user's utterance.
        # False is stored when no intention is found
        # (currently also False when a PIN or temperature is entered).
        self.intention = self.__get_intention__()
Example #23
    def get(self):
        args = request.args
        received_message = args['msg']

        analysis_frame = select_analysis_table_frame()
        print(analysis_frame)

        question_nouns = ''

        final_slot_max_tfidf_value = 0.0
        final_slot_nouns = ''

        analysis_frame['slot_str'] = pd.Series(analysis_frame['slot_1'] + ' ' +
                                               analysis_frame['slot_2'] + ' ' +
                                               analysis_frame['slot_3'] + ' ' +
                                               analysis_frame['slot_4'])
        print(analysis_frame['slot_str'])

        tagger = mecab.MeCab()
        print("MeCab tagger created.")

        nouns = tagger.nouns(received_message)
        print("MeCab noun extraction done.")
        for noun in nouns:
            question_nouns += noun + ' '

        result = ""

        print('question : ' + received_message)
        print('question_nouns : ' + question_nouns + '\n')

        for index, row in analysis_frame.iterrows():
            slots_nouns = ''
            nouns = tagger.nouns(row['slot_str'])
            for noun in nouns:
                slots_nouns += noun + ' '

            calc_list = list()
            calc_list.append(question_nouns)
            calc_list.append(slots_nouns)
            print('slots calc data : ' + str(calc_list) + '\n')

            tfidf_vectorizer = TfidfVectorizer(min_df=1)
            tfidf_matrix = tfidf_vectorizer.fit_transform(calc_list)
            document_distances = (tfidf_matrix * tfidf_matrix.T)
            print('slots and question - document_distances : ' +
                  str(document_distances.toarray()[0][1]))

            if document_distances.toarray()[0][1] >= 0.8:
                result += str(row['final_slot']) + ' at ' + str(
                    row['upload_date']) + ' / '

            nouns = tagger.nouns(row['final_slot'])
            final_slot_nouns = slots_nouns + ' '

            for noun in nouns:
                final_slot_nouns += noun + ' '

            calc_list = list()
            calc_list.append(question_nouns)
            calc_list.append(final_slot_nouns)

            print('final slot calc data : ' + str(calc_list) + '\n')

            tfidf_vectorizer = TfidfVectorizer(min_df=1)
            tfidf_matrix = tfidf_vectorizer.fit_transform(calc_list)
            document_distances = (tfidf_matrix * tfidf_matrix.T)
            print('final slot and question - document_distances : ' +
                  str(document_distances.toarray()[0][1]))

            if document_distances.toarray()[0][1] >= 0.85:
                result = row['data_content']
                break

        if result == "":
            result = "일치할 것으로 예상되는 값이 없습니다."

        # convert to a dictionary
        # json_dict = dict(json.loads(return_m))
        # print(json_dict.keys())
        # print(json_dict.values())
        print("Received message: " + received_message)

        return jsonify({'result': result})
Example #24
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if self.config.n_model == "nltk":
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            try:
                nltk.data.find("taggers/averaged_perceptron_tagger")
            except LookupError:
                nltk.download("averaged_perceptron_tagger")
            return PororoNLTKPosTagger(nltk, self.config)

        if self.config.n_model == "mecab-ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabPos(model, self.config)

        if self.config.n_model == "mecab-ipadic":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")
            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabJap(model, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba  # noqa
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            import jieba.posseg as jieba_pos

            model = jieba_pos
            return PororoJieba(model, self.config)
Example #25
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt" if "unigram"
                               not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize=True if "jpe" not in self.config.n_model else False,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)
Example #26
from fairseq import hub_utils
from fairseq.models.roberta import RobertaHubInterface, RobertaModel
import mecab
from pororo.models.brainbert.BrainRoBERTa import BrainRobertaHubInterface
from pororo.tasks.machine_reading_comprehension import PororoBertMrc
from pororo.tasks.utils.base import TaskConfig
from pororo.tasks.utils.download_utils import download_or_load
from pororo.tasks.utils.tokenizer import CustomTokenizer
from pororo.utils import postprocess_span
import torch

ckpt_dir = download_or_load("bert/brainbert.base.ko.korquad", "ko")
tok_path = download_or_load("tokenizers/bpe32k.ko.zip", "ko")

x = hub_utils.from_pretrained(
    ckpt_dir,
    "model.pt",
    ckpt_dir,
    load_checkpoint_heads=True
)
model = BrainRobertaHubInterface(
    x["args"],
    x["task"],
    x["models"][0],
    tok_path,
).to(torch.device("cuda"))

tagger = mecab.MeCab()
final = PororoBertMrc(model, tagger, postprocess_span, TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"))

print(final("이름이 뭐야?", "이름은 시리야."))