Python pos_tagの例、pythainlp.tag.pos_tag Pythonの例

コード例 #1

0

ファイルを表示

ファイル: __init__.py プロジェクト: zkan/pythainlp

    def get_ner(self,text,postag=True):
        """
        Tokenize Thai text, POS-tag it and label every token with the CRF model.

        The intermediate results are stored on the instance as ``word_cut``,
        ``list_word``, ``X_test`` and ``y_``.

        :param string text: Thai text
        :param boolean postag: include the POS tag in each result tuple (True)
                               or leave it out (False)

        :return: list of ``(word, pos, ner)`` tuples when ``postag`` is true,
                 otherwise a list of ``(word, ner)`` tuples.

        **Example**::
            >>> from pythainlp.ner import thainer
            >>> ner=thainer()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'JSBR', 'O'), (' ', 'NCMN', 'O'), ('15', 'NCNM', 'B-DATE'),
            (' ', 'NCMN', 'I-DATE'), ('ก.ย.', 'CMTR', 'I-DATE'),
            (' ', 'NCMN', 'I-DATE'), ('61', 'NCNM', 'I-DATE'), (' ', 'NCMN', 'O'),
            ('ทดสอบ', 'VACT', 'O'), ('ระบบ', 'NCMN', 'O'), ('เวลา', 'NCMN', 'O'),
            (' ', 'NCMN', 'O'), ('14', 'NCNM', 'B-TIME'), (':', 'PUNC', 'I-TIME'),
            ('49', 'NCNM', 'I-TIME'), (' ', 'NCMN', 'I-TIME'), ('น.', 'CMTR', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",postag=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'),
            (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        tokens = word_tokenize(text, engine=thaicut)
        self.word_cut = tokens
        tagged = pos_tag(tokens, engine='perceptron')
        self.list_word = tagged

        # The feature extractor receives (word, pos) pairs in token order.
        word_pos_pairs = [(word, tagged[idx][1]) for idx, word in enumerate(tokens)]
        self.X_test = self.extract_features(word_pos_pairs)
        self.y_ = self.crf.predict_single(self.X_test)
        labels = self.y_

        if not postag:
            return [(tokens[idx], label) for idx, label in enumerate(labels)]
        return [
            (tokens[idx], tagged[idx][1], label)
            for idx, label in enumerate(labels)
        ]

コード例 #2

0

ファイルを表示

def postag(text):
    """Insert a POS-tag column into tab-separated, one-token-per-line text.

    Empty lines are dropped. For every remaining line the first tab-separated
    field is taken as the word and the second field is carried over; the
    words are tagged together with ``pos_tag(..., engine='perceptron')``.

    :param text: newline-separated lines of the form ``word<TAB>value``
    :return: a string with one ``word<TAB>pos<TAB>value`` line per input
             line, each terminated by a newline
    """
    rows = [line.split('\t') for line in text.split('\n') if line != '']
    tagged = pos_tag([row[0] for row in rows], engine='perceptron')
    return "".join(
        row[0] + '\t' + tagged[position][1] + '\t' + row[1] + '\n'
        for position, row in enumerate(rows)
    )

コード例 #3

0

ファイルを表示

ファイル: checkcalendar.py プロジェクト: saket404/HAFHAD

def checkcalendar(token, text):
    """Dispatch a calendar query based on the date words found in the input.

    Branches, in order:

    * ``"วันนี้"`` occurs in ``text`` and is not the first token:
      ``getEventsToday()`` is called.
    * ``"พรุ่งนี้"`` occurs in ``text`` and not inside the first token:
      ``getEventsTomorrow()`` is called.
    * ``"วันที่"`` is one of the tokens: the token that follows it (after
      stopword filtering) is read as a day of the current month in the
      Asia/Bangkok timezone and passed to ``getEventsDate``.
    * otherwise the user is asked, via ``tts``, to specify a date.

    Unlike the previous implementation, ``token`` is no longer modified: the
    stopword filter used to call ``remove()`` on the very list it was
    iterating, which skipped adjacent stopwords and mutated the caller's list.

    :param token: list of word tokens of the request
    :param text: the raw request text
    :return: ``1`` when a date was looked up or when the "invalid date" /
             "please specify a date" message was spoken for a missing or
             non-numeric day; ``None`` in the today/tomorrow branches and when
             the day is out of range for, or earlier in, the current month
             (same as before).
    """
    e = token
    if "วันนี้" in text and e[0] != "วันนี้":
        getEventsToday()
    elif "พรุ่งนี้" in text and "พรุ่งนี้" not in e[0]:
        getEventsTomorrow()
    elif "วันที่" in e:
        thai_stopwords = set(stopwords.words('thai'))
        extra_stopwords = ('สิ', 'ดิ', 'หน่อย')

        # Build a new list instead of removing from the list being iterated.
        filter_word = [
            word for word in e
            if word not in thai_stopwords and word not in extra_stopwords
        ]

        if 'วันที่' not in filter_word:
            # The date marker itself was filtered out as a stopword.
            tts("โปรดระบุวันที่ด้วยค่ะ")
            return 1

        pos_list = pos_tag(filter_word, engine='artagger')
        index = filter_word.index('วันที่')

        if index + 1 >= len(pos_list):
            # Nothing follows the date marker, so there is no day to read.
            tts("วันที่ไม่ถูกต้องค่ะ")
            return 1

        next_word, next_tag = pos_list[index + 1][0], pos_list[index + 1][1]
        if next_tag == "DCNM" or next_word == "31":
            tz = pytz.timezone('Asia/Bangkok')
            now = datetime.datetime.now(tz=tz)
            try:
                day = int(next_word)
                # Constructing the date validates the day for this month.
                datetime.datetime(now.year, now.month, day)
                correct_date = True
            except ValueError:
                correct_date = False

            if not correct_date or day < now.day:
                tts("วันที่ไม่ถูกต้องค่ะ")
            else:
                getEventsDate(day)
                return 1

        else:
            tts("วันที่ไม่ถูกต้องค่ะ")
            return 1

    else:
        tts("โปรดระบุวันที่ด้วยค่ะ")
        return 1

コード例 #4

0

ファイルを表示

ファイル: all.py プロジェクト: totaeza31/MyProject

def DeepcutandPythai():
    """Tokenize a fixed sample sentence with deepcut and POS-tag the tokens.

    Punctuation characters and spaces are stripped from the sample before
    tokenizing; tagging uses ``pos_tag`` with ``corpus='orchid_ud'``.

    :return: a one-element list holding the ``pos_tag`` result
    """
    sample = "ทดสอบตัวตัดคำ ssนะจ้ะdsdsd/*-"
    unwanted = "\"'!@#$ %^&*,[](){};:./<>?|`~-=_+\\"
    # Map every unwanted character's code point to the empty string.
    removal_table = dict.fromkeys(map(ord, unwanted), "")
    cleaned = sample.translate(removal_table)

    tokens = deepcut.tokenize(cleaned)
    return [pos_tag(tokens, corpus='orchid_ud')]

コード例 #5

0

ファイルを表示

ファイル: testtfilewordeachword.py プロジェクト: totaeza31/MyProject

def posWord(looplamda):
  """Return ``word-TAG`` strings for the content produced by ``keepfile``.

  The value returned by ``keepfile(looplamda)`` is stringified, stripped of
  list punctuation and spaces, tokenized with deepcut and tagged with
  ``pos_tag(..., corpus='orchid_ud')``. When ``keepfile`` returns the string
  ``'[]'`` the literal text ``'empty'`` is processed instead.

  :return: list of strings, one per tagged token, formatted ``word-TAG``
  """
  data = keepfile(looplamda)
  if data == '[]':
    data = 'empty'

  flattened = str(data)
  for junk in ("[", "'", " ", "]", ","):
    flattened = flattened.replace(junk, "")

  tagged = pos_tag(deepcut.tokenize(flattened), corpus='orchid_ud')

  # Turn the repr of the (word, tag) list into space-separated "word-TAG".
  rendered = str(tagged)
  substitutions = (
      (" ", ""), ("),(", "|"), ("|", " "), ("'", ""),
      (",", "-"), ("(", ""), (")", ""), ("]", ""), ("[", ""),
  )
  for old, new in substitutions:
    rendered = rendered.replace(old, new)
  return rendered.split()

コード例 #6

0

ファイルを表示

def get_sent(text):
    """Insert ``|`` before every token the CRF model labels as ``B-S``.

    The text is tokenized with ``word_tokenize``; when the module-level flag
    ``poson`` is true each token is paired with its POS tag (perceptron
    engine, ``orchid_ud`` corpus) before feature extraction.

    :param text: text to split
    :return: the concatenated tokens with ``|`` placed before each ``B-S``
             token except the first token
    """
    global poson
    tokens = word_tokenize(text)
    if poson:
        word_cut = [
            (word, tag)
            for word, tag in pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        ]
    else:
        word_cut = [(word,) for word in tokens]

    features = [punct_features(word_cut, idx) for idx in range(len(word_cut))]
    labels = crf.predict_single(features)

    pieces = []
    for idx, label in enumerate(labels):
        if idx > 0 and label == "B-S":
            pieces.append("|")
        pieces.append(word_cut[idx][0])
    return "".join(pieces)

コード例 #7

0

ファイルを表示

def segment(doc: List[str]) -> List[List[str]]:
    """Group the words of ``doc`` into clauses.

    Words are POS-tagged (``lst20`` corpus), converted to features and
    labelled by ``tagger``. A clause is closed after every word labelled
    ``"E_CLS"`` and after the final word of ``doc``.

    :param doc: list of word tokens
    :return: list of clauses, each a list of words
    """
    features = _extract_features(pos_tag(doc, corpus="lst20"))
    markers = tagger.tag(features)

    last_index = len(doc) - 1
    clauses = []
    current = []
    for idx, (word, marker) in enumerate(list(zip(doc, markers))):
        current.append(word)
        if marker == "E_CLS" or idx == last_index:
            clauses.append(current)
            current = []

    return clauses

コード例 #8

0

ファイルを表示

def Opencsv(opens):
    """POS-tag the second column of ``./FileCSV1/<opens>_clean_translated.csv``.

    The first CSV row is skipped. Each remaining row's second field is
    tokenized with deepcut and tagged with ``pos_tag(..., corpus='orchid_ud')``;
    the tagged list is rendered as a string in which every
    ``(' ', 'PUNCT')`` entry between two other entries becomes a ``],[``
    separator and all spaces are removed.

    :param opens: file-name prefix
    :return: list with one rendered string per data row
    """
    path = f'./FileCSV1/{opens}_clean_translated.csv'
    values = []
    with open(path, encoding="utf8") as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # skip the first row
        for row in rows:
            tagged = pos_tag(deepcut.tokenize(row[1]), corpus='orchid_ud')
            rendered = str([tagged]).replace(", (' ', 'PUNCT'), ", "],[")
            values.append(rendered.replace(" ", ""))
    return values

コード例 #9

0

ファイルを表示

    def get_ner(
        self,
        text: str,
        pos: bool = True
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        Label every token of a Thai text with a named-entity tag.

        The text is tokenized, POS-tagged (perceptron engine, ``orchid_ud``
        corpus) and passed through the CRF model held in ``self.crf``.

        :param string text: Thai text
        :param boolean pos: include the Part-Of-Speech tag in each tuple
                            (True) or omit it (False)

        :return: list of ``(word, pos, ner)`` tuples, or ``(word, ner)``
                 tuples when ``pos`` is false

        **Example**::
            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('ก.ย.', 'NOUN', 'I-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'),
            (' ', 'PUNCT', 'O'), ('ทดสอบ', 'VERB', 'O'),
            ('ระบบ', 'NOUN', 'O'), ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'),
            (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'),
            (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
        self.__pos_tags = pos_tag(
            self.__tokens, engine="perceptron", corpus="orchid_ud"
        )
        self.__x_test = self.__extract_features(self.__pos_tags)
        self.__y = self.crf.predict_single(self.__x_test)

        tagged = self.__pos_tags
        results = []
        for idx, label in enumerate(self.__y):
            entry = tagged[idx]
            if pos:
                results.append((entry[0], entry[1], label))
            else:
                results.append((entry[0], label))
        return results

コード例 #10

0

ファイルを表示

ファイル: named_entity.py プロジェクト: wannaphongcom/pythainlp

    def get_ner(
        self, text: str, pos: bool = True
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        Tag each token of a Thai text with a named-entity label.

        Tokens come from ``word_tokenize``; POS tags come from ``pos_tag``
        with the perceptron engine and the ``orchid_ud`` corpus; labels come
        from the CRF model in ``self.crf``.

        :param string text: Thai text
        :param boolean pos: return the Part-Of-Speech tag as well (True) or
                            not (False)

        :return: list of ``(word, pos, ner)`` tuples, or ``(word, ner)``
                 tuples when ``pos`` is false

        **Example**::
            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('ก.ย.', 'NOUN', 'I-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'),
            (' ', 'PUNCT', 'O'), ('ทดสอบ', 'VERB', 'O'),
            ('ระบบ', 'NOUN', 'O'), ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'),
            (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'),
            (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
        self.__pos_tags = pos_tag(
            self.__tokens,
            engine="perceptron",
            corpus="orchid_ud",
        )
        self.__x_test = self.__extract_features(self.__pos_tags)
        self.__y = self.crf.predict_single(self.__x_test)

        word_pos = self.__pos_tags
        if not pos:
            return [
                (word_pos[idx][0], label)
                for idx, label in enumerate(self.__y)
            ]
        return [
            (word_pos[idx][0], word_pos[idx][1], label)
            for idx, label in enumerate(self.__y)
        ]

コード例 #11

0

ファイルを表示

ファイル: app.py プロジェクト: muitanprasert/thai-fakenews-chatbot

def similarity(msg, html):
    """Measure how many key words of ``msg`` appear in ``html``.

    ``msg`` is tokenized (``newmm-safe`` engine, whitespace dropped) and
    POS-tagged. A word is kept when its tag is missing or empty, when the tag
    is one of the key tags, or when the tag's first character is one of the
    key tags (this is how the single-letter ``'N'`` entry matches every tag
    that starts with ``N``).

    :param msg: message text
    :param html: article text; each kept word is tested with ``word in html``
    :return: ``(number_of_tagged_tokens, ratio)`` where ``ratio`` is the
             fraction of kept words found in ``html``; ``0.0`` when no word
             was kept (previously this raised ``ZeroDivisionError``)
    """
    from pythainlp.tag import pos_tag
    from pythainlp.tokenize import word_tokenize

    # tokenize and keep only words with "key" pos tags
    tokens = word_tokenize(msg, engine='newmm-safe', keep_whitespace=False)
    tuples = pos_tag(tokens, corpus='')
    key_tags = {
        'N', 'VACT', 'VATT', 'XVBB', 'XVMM', 'DCNM', 'DONM', 'CMTR', 'JCMP',
        'RPRE', 'PROPN',
    }
    words = [
        word for word, tag in tuples
        # `not tag` covers both an empty tag and a missing (None) tag.
        if not tag or tag in key_tags or tag[0] in key_tags
    ]

    # count no. words that appear in the article
    appear = sum(w in html for w in words)
    appear_ratio = appear / len(words) if words else 0.0
    return len(tuples), appear_ratio

コード例 #12

0

ファイルを表示

    def test_pos_tag(self):
        """Check pos_tag, pos_tag_sents and the unigram/perceptron taggers."""
        tokens = ["ผม", "รัก", "คุณ"]

        # Empty or missing input yields an empty list.
        for empty in (None, []):
            self.assertEqual(pos_tag(empty), [])

        for corpus in ("pud", "orchid"):
            for empty in (None, []):
                self.assertEqual(unigram.tag(empty, corpus=corpus), [])

        # Unigram engine.
        for words, corpus in ((tokens, "orchid"), (tokens, "pud"), ([""], "pud")):
            self.assertIsNotNone(pos_tag(words, engine="unigram", corpus=corpus))
        expected_meeting = [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")]
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
            expected_meeting,
        )

        # Perceptron engine.
        for corpus in ("orchid", "pud"):
            self.assertIsNotNone(
                pos_tag(tokens, engine="perceptron", corpus=corpus))
        for corpus in ("pud", "orchid"):
            for empty in (None, []):
                self.assertEqual(perceptron.tag(empty, corpus=corpus), [])

        # Sentence-level tagging.
        for empty in (None, []):
            self.assertEqual(pos_tag_sents(empty), [])
        expected_sents = [
            [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
            [("แมว", "NCMN"), ("วิ่ง", "VACT")],
        ]
        self.assertEqual(
            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
            expected_sents,
        )

コード例 #13

0

ファイルを表示

    def get_L_sentences(self, s):
        """Tokenize and POS-tag ``s`` and wrap each token in a ``CubeItem``.

        Underscores in a token are replaced by spaces for both ``word`` and
        ``lemma``; ``upos`` is looked up in ``DOrchardToUDPOS`` from the tag
        returned by ``pos_tag``. The remaining fields are set to empty
        strings and ``space_after`` is always ``'SpaceAfter=No'``.

        :param s: text to process
        :return: a list containing one list of ``CubeItem`` objects
        """
        from pythainlp import word_tokenize
        from pythainlp.tag import pos_tag  # , pos_tag_sents

        tagged_tokens = pos_tag(word_tokenize(s))

        items = [
            CubeItem(index=position,
                     word=token.replace('_', ' '),
                     lemma=token.replace('_', ' '),
                     upos=DOrchardToUDPOS[tag],
                     xpos='',
                     attrs='',
                     head='',
                     label='',
                     space_after='SpaceAfter=No')
            for position, (token, tag) in enumerate(tagged_tokens)
        ]
        return [items]

コード例 #14

0

ファイルを表示

ファイル: clean_and_segmented_sentences.py プロジェクト: UDICatNCHU/new_kcm

def th(article):
    """Yield POS-tagged keywords for every non-empty line of a Thai article.

    Each line of ``article['text']`` is tokenized with the ``newmm`` engine.
    Tokens that are Thai stopwords, punctuation (including the extra symbols
    and single Thai consonants listed below) or that start with ``$`` are
    dropped. Lines with no remaining token are skipped.

    :param article: mapping with at least a ``'text'`` key holding the
                    article body; lines are separated by a newline
    :return: generator of lists of ``(keyword, pos)`` tuples, one list per
             line; a missing tag is reported as the string ``'None'``
    """
    # Parameters
    # ----------
    # article = {'url': 'https://th.wikipedia.org/wiki?curid=792',
    #            'title': 'โอเพนออฟฟิศดอตอ็อก', 'id': '792',
    #            'text': '<article body, paragraphs separated by newlines>'}
    # for th
    from pythainlp.corpus import stopwords
    from pythainlp.tokenize import word_tokenize
    from pythainlp.tag import pos_tag
    import string

    extraPunctions = [
        '$', '。', '/', '(', ')', '.', 'ํ', '|', '๐', 'OO', 'o', 'ก', 'ข', 'ฃ',
        'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ',
        'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ',
        'ภ', 'ม', 'ย', 'ร', 'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ'
    ]
    # Sets make the per-token membership tests below constant time.
    punctuation = set(string.punctuation)
    punctuation.update(extraPunctions)
    thstopwords = frozenset(stopwords.words('thai'))

    for line in article['text'].split('\n'):
        if not line:
            continue
        tokens = [
            token for token in word_tokenize(line, engine='newmm')
            if token not in thstopwords and token not in punctuation
            and not token.startswith('$')
        ]
        if not tokens:
            continue
        yield [
            (keyword, 'None' if pos is None else pos)
            for keyword, pos in pos_tag(tokens, engine='old')
        ]

コード例 #15

0

ファイルを表示

ファイル: proj.py プロジェクト: guiaria/HAFHAD

# Script fragment: filter the token list `e`, POS-tag what is left and
# collect the words that follow an "เปิดไฟ" / "เปิด ปลั๊ก" command in
# open_light.
# NOTE(review): `e` is defined outside the visible part of this script;
# the code below indexes and iterates it like a list of word tokens.
#e = ''.join(e)
print(e)
open_light = []
close_light = []
# NOTE(review): this rebinds the name `stopwords` from the object providing
# .words() to its result, so stopwords.words() cannot be called again below.
stopwords = stopwords.words('thai')
stopwords1 = ['สิ', 'ดิ', 'หน่อย']

# NOTE(review): filter_word1 is an alias of `e`, not a copy. remove() below
# therefore mutates the list that the for-loop is iterating, which makes the
# loop skip the element following each removed word - confirm whether that
# is intended.
filter_word1 = e
for word in e:
    # Words containing "เปิด" or "ปิด" are always kept.
    if any(["เปิด" in word, "ปิด" in word]):
        continue
    if word in stopwords:
        filter_word1.remove(word)

filter_word = [word1 for word1 in filter_word1 if word1 not in stopwords1]
pos_list = pos_tag(filter_word, engine='artagger')
print(pos_list)

# NOTE(review): filter_word[0] / [1] / [2] are read without a length check;
# an empty or too-short filter_word raises IndexError here.
if (filter_word[0] == "เปิดไฟ"):
    open_light.append(filter_word[1])
    if len(filter_word) > 2:
        # A second target is taken when "เเละ" follows the first target and
        # is not the last word.
        if (filter_word[2] == "เเละ" and filter_word[-1] != "เเละ"):
            open_light.append(filter_word[3])

if (filter_word[0] == "เปิด" and filter_word[1] == "ปลั๊ก"):
    open_light.append(filter_word[2])
    if len(filter_word) > 3:
        if (filter_word[3] == "เเละ" and filter_word[-1] != "เเละ"):
            open_light.append(filter_word[4])

if (filter_word[0] == "ปิดไฟ"):

コード例 #16

0

ファイルを表示

ファイル: get_data_AT.py プロジェクト: linloveorm/Withholding_Research

            # print("non-related : "+str(i)+" : "+record_item[i]['fields']['Short Description'])

        for itr in range(len(account_payable)):
            # print(record_item[i]['fields']['Account Payable'][0])
            # print(account_payable[itr]['id'])
            if (record_item[i]['fields']['Account Payable'][0] ==
                    account_payable[itr]['id']):
                # print("Purchase: " + record_item[i]['fields']['Account Payable'][0])
                # print("Payable: "+account_payable[itr]['id'])
                # payable_data.append([account_payable[itr]['fields']['Name'],account_payable[itr]['fields']['Payable ID']])

                arr_cutting.append([
                    i, record_item[i]['fields']['Category'],
                    record_item[i]['id'],
                    pos_tag(word_cutting, engine='unigram', corpus='pud'),
                    pos_tag(request_cutting, engine='unigram',
                            corpus='pud'), inv_amount, bf_vat, vat, pay_amount,
                    withholding, percentage,
                    [
                        account_payable[itr]['fields']['Name'],
                        account_payable[itr]['fields']['Payable ID']
                    ]
                ])  #Append data to the new list

                # print(payable_data)

        # arr_cutting.append([record_item[i]['id'],word_cutting,record_item[i]['fields']['THB Invoice Amount'],record_item[i]['fields']['Tax Withholding Amount'],payable_data])

# print(arr_cutting)

コード例 #17

0

ファイルを表示

ファイル: deepcut_test.py プロジェクト: Jirayut558/Seniorproject

def deep_word(text):
    """Tokenize ``text`` with deepcut, POS-tag the tokens and print the result.

    Tagging uses ``pos_tag`` with the ``artagger`` engine. Nothing is returned.
    """
    #text = get_news(url)
    tokens = deepcut.tokenize(text)
    print(pos_tag(tokens, engine='artagger'))

コード例 #18

0

ファイルを表示

ファイル: using2.py プロジェクト: wannaphong/thai-ner

def get_ner(text):
    """Return ``(word, pos, ner)`` tuples for every token of ``text``.

    The text is tokenized with ``word_tokenize(engine=thaicut)``, tagged with
    the perceptron POS tagger and labelled by the module-level ``crf`` model.
    """
    tokens = word_tokenize(text, engine=thaicut)
    tagged = pos_tag(tokens, engine='perceptron')
    word_pos_pairs = [(word, tagged[idx][1]) for idx, word in enumerate(tokens)]
    labels = crf.predict_single(extract_features(word_pos_pairs))

    result = []
    for idx, label in enumerate(labels):
        result.append((tokens[idx], tagged[idx][1], label))
    return result

コード例 #19

0

ファイルを表示

ファイル: prepare_data_for_question.py プロジェクト: noratap09/QA_question_type_2

        elif(in_name == "B-LEN" or in_name == "I-LEN"): return 9
        elif(in_name == "B-LAW" or in_name == "I-LAW"): return 10   
        elif(in_name == "B-PHONE" or in_name == "I-PHONE"): return 11 
        elif(in_name == "B-ZIP"): return 12
        elif(in_name == "B-EMAIL" or in_name == "I-EMAIL"): return 13  
        else: return 1

# For entries 15000-15002 of json_obj['data'] whose question_type is 2:
# build a (question_len, Word2Vec_len + pos_len) float32 matrix with one row
# per token, draw a heat map of it and save it as a .npy file.
for count_data, data in enumerate(json_obj['data'][15000:15003], start=1):
    if data['question_type'] == 2:
        pre_data = np.zeros((question_len, Word2Vec_len + pos_len),
                            dtype=np.float32)
        question_id = data['question_id']
        print("QUESTION_ID: ", question_id)

        question = data['question'].lower()
        question = deepcut.tokenize(question)
        result_pos = pos_tag(question)

        for n_j, j in enumerate(question, start=0):
            if j in model.wv.vocab.keys():
                # Shift and scale the word vector with fixed constants.
                # NOTE(review): presumably the observed min/max of the
                # embedding values - confirm.
                pre_data[n_j, 0:Word2Vec_len] = (
                    (model.wv.get_vector(j) + 2.6491606)
                    / (2.6491606 + 2.6473184)
                )
            # One-hot encode the POS tag after the word-vector columns.
            pre_data[n_j, Word2Vec_len + pos_all.index(result_pos[n_j][1])] = 1.0
        #draw_heat_map
        import heat_map
        temp = list()
        heat_map.make_heatmap("heatmap_question/" + str(question_id) + ".png",
                              temp, question, pre_data)
        #check_point
        # Raw string: "\i" is not a valid escape sequence. The resulting path
        # is the same as before, without the invalid-escape warning.
        np.save(r"train_data\input_question\input_B_" + str(question_id),
                pre_data)
        #print(pre_data)

#save final
#ck_point(all_input,all_output,math.ceil((last_data_count+1)/ck_point_time))

コード例 #20

0

ファイルを表示

	def testTagnew(self):
		"""On Python newer than 3.3, check the artagger tags for "ผมรักคุณ"."""
		if not sys.version_info > (3,3):
			return
		expected = [('ผม', 'PPRS'), ('รัก', 'VSTA'), ('คุณ', 'PPRS')]
		tagged = pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger')
		self.assertEqual(tagged,expected)

コード例 #21

0

ファイルを表示

 def getPOSTagging(text):
     """Return ``pos_tag(text)`` using the tagger's default arguments."""
     tagged = pos_tag(text)
     return tagged

コード例 #22

0

ファイルを表示

ファイル: __init__.py プロジェクト: somjeat/pythainlp

	def test_tag(self):
		"""Check the 'old' engine's tags and, on Python >= 3.4, that artagger returns a list."""
		expected = [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')]
		self.assertEqual(pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old'),expected)
		if not sys.version_info >= (3,4):
			return
		artagger_result = pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger')
		self.assertEqual(str(type(artagger_result)),"<class 'list'>")

コード例 #23

0

ファイルを表示

ファイル: analysis.py プロジェクト: yakung/requirement-classification

def pos_tag_refined(words, remove_list=('NCMN', )):
    """POS-tag ``words`` and drop unwanted entries.

    :param words: tokens passed to ``pos_tag``
    :param remove_list: tags whose entries are discarded
    :return: list of the ``pos_tag`` entries whose tag is not in
             ``remove_list`` and whose word is not a single space
    """
    kept = []
    for entry in pos_tag(words):
        if entry[1] in remove_list or entry[0] == ' ':
            continue
        kept.append(entry)
    return kept

コード例 #24

0

ファイルを表示

    def test_chunk_parse(self):
        """chunk_parse returns a non-None value for perceptron/orchid tagged tokens."""
        tagged_tokens = pos_tag(
            ["ผม", "รัก", "คุณ"], engine="perceptron", corpus="orchid"
        )
        chunks = chunk_parse(tagged_tokens)
        self.assertIsNotNone(chunks)

コード例 #25

0

ファイルを表示

ファイル: __init__.py プロジェクト: zkan/pythainlp

	def test_tag(self):
		"""Check the 'old' engine, pos_tag_sents and, on Python >= 3.4, artagger's return type."""
		expected_words = [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')]
		self.assertEqual(pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old'),expected_words)

		expected_sents = [[('ผม', 'PPRS'), ('กิน', 'VACT'), ('ข้าว', 'NCMN')], [('แมว', 'NCMN'), ('วิ่ง', 'VACT')]]
		self.assertEqual(pos_tag_sents([["ผม","กิน","ข้าว"],["แมว","วิ่ง"]]),expected_sents)

		if not sys.version_info >= (3,4):
			return
		artagger_result = pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger')
		self.assertEqual(str(type(artagger_result)),"<class 'list'>")

コード例 #26

0

ファイルを表示

ファイル: remove_eng.py プロジェクト: btylk/Scrap

import pythainlp
import codecs
from pythainlp.tag import pos_tag 
from pythainlp.tokenize import word_tokenize
# Tokenize and POS-tag the first line of ./data/data7.txt, then print the tags.
i=0
# The with-block closes the file once the line has been read; previously the
# handle was left open.
with codecs.open('./data/data7.txt','r','utf-8') as file:
    # for line in file:
    sentence = file.readline()
words = word_tokenize(sentence, engine="newmm",keep_whitespace=False)

print(pos_tag(words, engine="unigram", corpus="lst20"))

コード例 #27

0

ファイルを表示

	def testTag(self):
		"""The 'old' engine tags "คุณกำลังประชุม" as PPRS / XVBM / VACT."""
		expected = [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')]
		tagged = pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old')
		self.assertEqual(tagged,expected)

コード例 #28

0

ファイルを表示

from pylexto import LexTo
from pythainlp.tag import pos_tag
from pythainlp.tokenize import etcc
from pythainlp.tokenize import word_tokenize
from pythainlp.util import collate




# Tokenize a sample sentence with the deepcut engine, collate the tokens,
# POS-tag them and ask on stdin for the tag of every word the tagger left
# untagged (None).
text = "ค่าจ้าง ค่าเช่า ค่าตอบแทน ค่าบริการ จำเป็นต้องหัก ณ ที่จ่ายส่งสรรพากร"

text_cutting = word_tokenize(text, engine="deepcut")
text_collate = collate(text_cutting)
print("deepcut  :", text_cutting)  # the engine best suited to our task


text_tag_list = pos_tag(text_collate)
print(text_tag_list)

text_pos = []
pos_vact = []
pos = ""

for itr, (word, tag) in enumerate(text_tag_list):
    entry = [word, tag]
    text_pos.append(entry)

    # Identity comparison is the correct test for a missing tag.
    if tag is None:
        print("Word : "+word)
        pos = input()
        print("POS : "+pos)
        entry[1] = pos

コード例 #29

0

ファイルを表示

ファイル: prepare_data.py プロジェクト: noratap09/QA_question_type_2

        txt = [x.lower() for x in txt]

        #print("TXT : ",txt)

        ck_answer = list()
        answer = data['answer']

        print("t ->", math.ceil(len(txt) / slide_size))
        for i in range(0, math.ceil(len(txt) / slide_size)):
            pre_data = np.zeros((input_text_len, Word2Vec_len + pos_len),
                                dtype=np.float32)
            pre_ans = np.zeros((num_class), dtype=np.float32)
            input_text = txt[i * slide_size:i * slide_size + input_text_len -
                             1]

            result_pos = pos_tag(input_text)

            print("Tag : ", i)
            print("CK_ANS : ", answer)
            print("QUESTION : ", question)
            print("INPUT TXT : ", "".join(input_text))
            #print(len(txt)," > ",i,":",i*slide_size,"-",i*slide_size+input_text_len)
            #get input feature

            for n_j, j in enumerate(input_text, start=0):
                if (j in model.wv.vocab.keys()):
                    pre_data[n_j, 0:Word2Vec_len] = (model.wv.get_vector(j) +
                                                     2.6491606) / (2.6491606 +
                                                                   2.6473184)
                pre_data[n_j, Word2Vec_len +
                         (pos_all.index(result_pos[n_j][1]))] = 1.0

コード例 #30

0

ファイルを表示


# Load corpus/665.txt, strip its HTML, tokenize it, load the saved Word2Vec
# model and POS-tag the tokens with the 'old' engine.
# The with-block closes the file after reading; previously the handle was
# left open.
with open("corpus/665.txt", encoding="utf8") as text_file:
    data = [text_file.read()]

data[0] = cleanhtml(data[0])
print("Raw  :", data)

tokendata = [list(word_tokenize(i, engine='newmm')) for i in data]
print("Tokenized    :", tokendata)
# The two commented-out lines below train and save the model loaded next:
# model = Word2Vec(tokendata,size=3, min_count=1,window=5,sg=1)
# model.save('665txt-3d.bin')
modelW2V = Word2Vec.load("665txt-3d.bin")
words = list(modelW2V.wv.vocab)
print("Words    :", words)
dataTag = pos_tag(tokendata[0], engine='old')
print(dataTag)

nouns = []
verbs = []
adverbs = []
prepo = []

for _, x in dataTag:

    if (x == 'NCMN' and _.__len__() > 1 and _ not in nouns
            and nouns.__len__() < 20):
        nouns.append((_, 0))

    if (x == 'VACT' and _.__len__() > 1 and _ not in verbs
            and verbs.__len__() < 20):

コード例 #31

0

ファイルを表示

    def test_pos_tag(self):
        """Check pos_tag, pos_tag_sents and the unigram/perceptron taggers.

        Covers empty and ``None`` input, the default engine, the unigram and
        perceptron engines on several corpora, and sentence-level tagging.
        """
        tokens = ["ผม", "รัก", "คุณ"]

        self.assertEqual(pos_tag(None), [])
        self.assertEqual(pos_tag([]), [])
        self.assertEqual(
            pos_tag(["นักเรียน", "ถาม", "ครู"]),
            [("นักเรียน", "NCMN"), ("ถาม", "VACT"), ("ครู", "NCMN")],
        )
        self.assertEqual(
            len(pos_tag(["การ", "เดินทาง", "มี", "ความ", "ท้าทาย"])), 5)

        self.assertEqual(unigram.tag(None, corpus="pud"), [])
        self.assertEqual(unigram.tag([], corpus="pud"), [])
        self.assertEqual(unigram.tag(None, corpus="orchid"), [])
        self.assertEqual(unigram.tag([], corpus="orchid"), [])
        self.assertIsNotNone(pos_tag(tokens, engine="unigram",
                                     corpus="orchid"))
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertEqual(
            pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
        )

        # assertTrue(value, "NOUN") used "NOUN" as the failure message and
        # passed for any non-empty tag; assertEqual actually checks the tag.
        self.assertEqual(
            pos_tag(["การ", "รัฐประหาร"], corpus="orchid_ud")[0][1], "NOUN")
        self.assertEqual(
            pos_tag(["ความ", "พอเพียง"], corpus="orchid_ud")[0][1], "NOUN")

        self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
        self.assertEqual(perceptron.tag([], corpus="orchid"), [])
        self.assertEqual(perceptron.tag(None, corpus="orchid_ud"), [])
        self.assertEqual(perceptron.tag([], corpus="orchid_ud"), [])
        self.assertEqual(perceptron.tag(None, corpus="pud"), [])
        self.assertEqual(perceptron.tag([], corpus="pud"), [])
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid"))
        # The "orchid" assertion above was duplicated here; the copy now
        # covers the "orchid_ud" corpus, matching the perceptron.tag checks.
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid_ud"))
        self.assertIsNotNone(pos_tag(tokens, engine="perceptron",
                                     corpus="pud"))

        self.assertEqual(pos_tag_sents(None), [])
        self.assertEqual(pos_tag_sents([]), [])
        self.assertEqual(
            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
            [
                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
            ],
        )

コード例 #32

0

ファイルを表示

def Pos(data):
    """Return whatever ``pos_tag`` yields for *data* with the ``orchid_ud`` corpus."""
    return pos_tag(data, corpus="orchid_ud")

コード例 #33

0

ファイルを表示

ファイル: named_entity.py プロジェクト: pacharapol4066/Product-Position-SNA

    def get_ner(
        self,
        text: str,
        pos: bool = True,
        tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        Tag named entities in a text, using IOB-style labels.

        :param str text: text in Thai to be tagged
        :param bool pos: include POS tags in the result (`True`) or
                         exclude them (`False`). The default value is `True`.
                         Ignored when `tag` is `True`.
        :param bool tag: return a single string in which every entity is
                         wrapped in HTML-like tags instead of a list of
                         tuples. The default value is `False`.
        :return: if `tag` is `True`, the text with each entity wrapped as
                 ``<TYPE>...</TYPE>``;
                 otherwise, if `pos` is `True`, a list of
                 ``(word, POS tag, NER tag)`` tuples;
                 otherwise a list of ``(word, NER tag)`` tuples
        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]

        :Note:
            * POS tags are always computed, because they are passed to the
              feature extractor; this function uses
              :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
              and corpus as `orchid_ud`.

        :Example:

            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>>
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
            ('น.', 'NOUN', 'I-TIME')]
            >>>
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
            ...             pos=False)
            [('วันที่', 'O'), (' ', 'O'),
            ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
            ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'),
            ('เวลา', 'O'), (' ', 'O'),
            ('14', 'B-TIME'), (':', 'I-TIME'),
            ('49', 'I-TIME'), (' ', 'I-TIME'),
            ('น.', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
            ...             tag=True)
            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        tokens = _tokenizer.word_tokenize(text)
        pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        x_test = ThaiNameTagger.__extract_features(pos_tags)
        y = self.crf.tag(x_test)

        if tag:
            # Collect the pieces and join once instead of growing a string.
            pieces = []
            open_entity = ""  # entity type whose tag is currently open
            for (word, _), ner in zip(pos_tags, y):
                if ner.startswith("B-"):
                    # A new entity begins: close the previous one first.
                    if open_entity:
                        pieces.append("</" + open_entity + ">")
                    open_entity = ner[2:]
                    pieces.append("<" + open_entity + ">")
                elif ner == "O" and open_entity:
                    # Left the entity: close it before the outside word.
                    pieces.append("</" + open_entity + ">")
                    open_entity = ""
                pieces.append(word)

            # An entity that runs to the end of the text is still open.
            if open_entity:
                pieces.append("</" + open_entity + ">")

            return "".join(pieces)

        if pos:
            return [(word, pos_label, ner)
                    for (word, pos_label), ner in zip(pos_tags, y)]

        return [(word, ner) for (word, _), ner in zip(pos_tags, y)]

コード例 #34

0

ファイルを表示

ファイル: __init__.py プロジェクト: wannaphongcom/pythainlp

    def test_pos_tag(self):
        """Check ``pos_tag``, the per-engine ``tag`` functions and ``pos_tag_sents``."""
        sample = ["ผม", "รัก", "คุณ"]
        meeting_tags = [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")]

        # None and an empty list are both expected to give an empty result.
        for nothing in (None, []):
            self.assertEqual(pos_tag(nothing), [])

        # Unigram engine.
        for corpus_name in ("pud", "orchid"):
            for nothing in (None, []):
                self.assertEqual(unigram.tag(nothing, corpus=corpus_name), [])

        for corpus_name in ("orchid", "pud"):
            self.assertIsNotNone(
                pos_tag(sample, engine="unigram", corpus=corpus_name))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
            meeting_tags,
        )

        # Perceptron engine.
        for corpus_name in ("orchid", "pud"):
            self.assertIsNotNone(
                pos_tag(sample, engine="perceptron", corpus=corpus_name))
        for corpus_name in ("pud", "orchid"):
            for nothing in (None, []):
                self.assertEqual(
                    perceptron.tag(nothing, corpus=corpus_name), [])

        # artagger engine.
        for candidate in (None, [], sample):
            self.assertIsNotNone(pos_tag(candidate, engine="artagger"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
            meeting_tags,
        )

        # Sentence-level tagging.
        for nothing in (None, []):
            self.assertEqual(pos_tag_sents(nothing), [])
        self.assertEqual(
            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
            [
                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
            ],
        )