Ejemplos de pos_tag en Python, ejemplos de pythainlp.tag.pos_tag en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: __init__.py Proyecto: zkan/pythainlp

    def get_ner(self,text,postag=True):
        """
        Label every token of ``text`` with a named-entity tag.

        The text is tokenized, POS-tagged with the perceptron engine, turned
        into CRF features and decoded. The intermediate results are kept on
        the instance as ``word_cut``, ``list_word``, ``X_test`` and ``y_``.

        :param string text: Thai text
        :param boolean postag: include the POS tag in each result tuple
            (True) or leave it out (False)

        :return: list of ``(word, pos, ner)`` tuples when ``postag`` is true,
            otherwise a list of ``(word, ner)`` tuples.

        **Example**::
            >>> from pythainlp.ner import thainer
            >>> ner=thainer()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'JSBR', 'O'), (' ', 'NCMN', 'O'), ('15', 'NCNM', 'B-DATE'), (' ', 'NCMN', 'I-DATE'), ('ก.ย.', 'CMTR', 'I-DATE'), (' ', 'NCMN', 'I-DATE'), ('61', 'NCNM', 'I-DATE'), (' ', 'NCMN', 'O'), ('ทดสอบ', 'VACT', 'O'), ('ระบบ', 'NCMN', 'O'), ('เวลา', 'NCMN', 'O'), (' ', 'NCMN', 'O'), ('14', 'NCNM', 'B-TIME'), (':', 'PUNC', 'I-TIME'), ('49', 'NCNM', 'I-TIME'), (' ', 'NCMN', 'I-TIME'), ('น.', 'CMTR', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",postag=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'), ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'), ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        tokens = self.word_cut = word_tokenize(text, engine=thaicut)
        tagged = self.list_word = pos_tag(tokens, engine='perceptron')

        # Pair each token with the POS tag found at the same position.
        word_pos_pairs = []
        for idx, word in enumerate(tokens):
            word_pos_pairs.append((word, tagged[idx][1]))

        self.X_test = self.extract_features(word_pos_pairs)
        labels = self.y_ = self.crf.predict_single(self.X_test)

        if not postag:
            return [(tokens[idx], label) for idx, label in enumerate(labels)]
        return [
            (tokens[idx], tagged[idx][1], label)
            for idx, label in enumerate(labels)
        ]

Ejemplo n.º 2

0

Mostrar archivo

def postag(text):
    """
    Insert a part-of-speech column into tab-separated token lines.

    Each non-empty line of ``text`` is expected to hold at least two
    tab-separated fields. The first field is treated as the word and the
    second is carried over unchanged; any further fields are dropped.

    :param str text: newline-separated ``word<TAB>label`` lines
    :return: the same rows as ``word<TAB>pos<TAB>label`` lines, each
        terminated by a newline
    :raises IndexError: if a non-empty line has no tab-separated second field
    """
    # Split every line once instead of re-splitting it for each field.
    rows = [line.split('\t') for line in text.split('\n') if line != '']
    tagged = pos_tag([row[0] for row in rows], engine='perceptron')
    # Build the output with a single join rather than repeated `+=`.
    return ''.join(
        row[0] + '\t' + tagged[i][1] + '\t' + row[1] + '\n'
        for i, row in enumerate(rows)
    )

Ejemplo n.º 3

0

Mostrar archivo

Archivo: checkcalendar.py Proyecto: saket404/HAFHAD

def checkcalendar(token, text):
    """
    Dispatch a calendar query according to the date words in a request.

    Branches, in order:

    * the text contains "วันนี้" and it is not the first token: fetch
      today's events;
    * the text contains "พรุ่งนี้" and it is not inside the first token:
      fetch tomorrow's events;
    * the tokens contain "วันที่": read the token that follows it as a day
      of the current month (Asia/Bangkok time) and fetch that day's events,
      or speak an "invalid date" message;
    * otherwise: ask the user to specify a date.

    :param token: list of word tokens for the request (no longer mutated)
    :param text: the raw request text
    :return: 1 from the branches that return explicitly (events fetched for
        a specific day, no usable token after the keyword, no date keyword
        at all); None from the remaining branches, as before
    """
    e = token
    if "วันนี้" in text and e[0] != "วันนี้":
        getEventsToday()
    elif "พรุ่งนี้" in text and "พรุ่งนี้" not in e[0]:
        getEventsTomorrow()
    elif "วันที่" in e:
        thai_stopwords = set(stopwords.words('thai'))
        stopwords1 = ('สิ', 'ดิ', 'หน่อย')

        # Build a new list instead of calling remove() on the list being
        # iterated: the old code aliased `token`, skipped the element after
        # every removal and modified the caller's list. The date keyword is
        # always kept so that the lookup below cannot fail.
        filter_word = [
            word for word in e
            if word == "วันที่"
            or (word not in thai_stopwords and word not in stopwords1)
        ]

        pos_list = pos_tag(filter_word, engine='artagger')
        index = filter_word.index('วันที่')

        # Nothing follows the keyword: treat it like any other unusable date
        # instead of raising IndexError.
        if index + 1 >= len(pos_list):
            tts("วันที่ไม่ถูกต้องค่ะ")
            return 1

        if (pos_list[index + 1][1] == "DCNM"
                or pos_list[index + 1][0] == "31"):
            checkdate = pos_list[index + 1][0]
            tz = pytz.timezone(('Asia/Bangkok'))
            d = datetime.datetime.now(tz=tz)
            try:
                # Constructing the date validates the day for this month;
                # a non-numeric token also ends up here via int().
                day = int(checkdate)
                datetime.datetime(int(d.year), int(d.month), day)
            except ValueError:
                tts("วันที่ไม่ถูกต้องค่ะ")
            else:
                if day < d.day:
                    # Days earlier in the current month are rejected.
                    tts("วันที่ไม่ถูกต้องค่ะ")
                else:
                    getEventsDate(day)
                    return 1

        else:
            tts("วันที่ไม่ถูกต้องค่ะ")
            return 1

    else:
        tts("โปรดระบุวันที่ด้วยค่ะ")
        return 1

Ejemplo n.º 4

0

Mostrar archivo

Archivo: all.py Proyecto: totaeza31/MyProject

def DeepcutandPythai():
    """
    Tokenize a fixed sample sentence with deepcut and POS-tag it.

    Punctuation and spaces are stripped from the sample before it is
    tokenized; the tokens are tagged with the 'orchid_ud' corpus.

    :return: a one-element list holding the list of tagged tokens
    """
    sample = "ทดสอบตัวตัดคำ ssนะจ้ะdsdsd/*-"
    # Translation table that deletes every listed character.
    strip_table = str.maketrans(
        "", "", "\"'!@#$ %^&*,[](){};:./<>?|`~-=_+\\")
    stripped = str(sample).translate(strip_table)

    tagged = pos_tag(deepcut.tokenize(stripped), corpus='orchid_ud')
    return [tagged]

Ejemplo n.º 5

0

Mostrar archivo

Archivo: testtfilewordeachword.py Proyecto: totaeza31/MyProject

def posWord(looplamda):
  """
  POS-tag the content returned by ``keepfile`` as ``word-TAG`` strings.

  The value from ``keepfile(looplamda)`` is stringified, stripped of list
  punctuation and spaces, tokenized with deepcut and tagged with the
  'orchid_ud' corpus. When ``keepfile`` returns the string ``'[]'`` the
  placeholder text ``'empty'`` is processed instead.

  :param looplamda: argument forwarded unchanged to ``keepfile``
  :return: list of ``word-TAG`` strings, one per tagged token
  """
  data = keepfile(looplamda)
  # The two original branches differed only in this substitution.
  if data == '[]':
    data = 'empty'

  # Delete the brackets, quotes, spaces and commas of a stringified list.
  stringdata = str(data).translate(str.maketrans("", "", "[' ],"))
  list_word = deepcut.tokenize(stringdata)
  posList_word = pos_tag(list_word, corpus='orchid_ud')
  # Flatten "[('w', 'T'), ...]" into "w-T w-T ..." by rewriting the repr.
  text = str(posList_word).replace(" ","")\
      .replace("),(","|").replace("|"," ").replace("'","")\
      .replace(",","-").replace("(","").replace(")","")\
      .replace("]","").replace("[","")
  return text.split()

Ejemplo n.º 6

0

Mostrar archivo

def get_sent(text):
    """
    Mark sentence boundaries in ``text`` with ``|``.

    The text is tokenized; when the module-level flag ``poson`` is truthy
    each token is paired with its POS tag (perceptron engine, 'orchid_ud'
    corpus). The CRF model then labels every token, and a ``|`` is inserted
    before each token labelled ``"B-S"`` except the very first one.

    :param text: text to split
    :return: the tokens joined back together with ``|`` at the boundaries
    """
    tokens = word_tokenize(text)
    if poson:
        word_cut = [
            (word, tag)
            for word, tag in pos_tag(
                tokens, engine="perceptron", corpus="orchid_ud")
        ]
    else:
        word_cut = [(word,) for word in tokens]

    features = [punct_features(word_cut, k) for k in range(len(word_cut))]
    labels = crf.predict_single(features)

    pieces = []
    for k, label in enumerate(labels):
        if k > 0 and label == "B-S":
            pieces.append("|")
        pieces.append(word_cut[k][0])
    return "".join(pieces)

Ejemplo n.º 7

0

Mostrar archivo

def segment(doc: List[str]) -> List[List[str]]:
    """
    Group a list of words into clauses.

    The words are POS-tagged with the 'lst20' corpus, converted to features
    and passed to ``tagger``. A clause is closed after every word whose
    marker is ``"E_CLS"`` and after the last word of ``doc``.

    :param doc: list of words
    :return: list of clauses, each a list of words
    """
    markers = tagger.tag(_extract_features(pos_tag(doc, corpus="lst20")))
    last_index = len(doc) - 1

    clauses = []
    current = []
    for position, (word, marker) in enumerate(zip(doc, markers)):
        # Every word joins the open clause; only the closing step differs.
        current.append(word)
        if marker == "E_CLS" or position == last_index:
            clauses.append(current)
            current = []

    return clauses

Ejemplo n.º 8

0

Mostrar archivo

def Opencsv(opens):
    """
    POS-tag the second column of a translated CSV file.

    Reads ``./FileCSV1/<opens>_clean_translated.csv``, skips the first row,
    tokenizes column 1 of each remaining row with deepcut and tags it with
    the 'orchid_ud' corpus.

    :param opens: file-name prefix
    :return: one string per row: the repr of the tagged tokens with every
        ``(' ', 'PUNCT')`` separator turned into ``],[`` and all spaces
        removed
    """
    path = f'./FileCSV1/{opens}_clean_translated.csv'
    values = []
    with open(path, encoding="utf8") as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # skip the header row
        for row in rows:
            tagged = pos_tag(deepcut.tokenize(row[1]), corpus='orchid_ud')
            rendered = str([tagged])
            values.append(
                rendered.replace(", (' ', 'PUNCT'), ", "],[").replace(" ", "")
            )
    return values

Ejemplo n.º 9

0

Mostrar archivo

    def get_ner(
        self,
        text: str,
        pos: bool = True
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        Tag the named entities found in ``text``.

        The text is tokenized, POS-tagged (perceptron engine, 'orchid_ud'
        corpus), converted to features and decoded by the CRF model.

        :param string text: Thai text
        :param boolean pos: include the part-of-speech tag in every result
            tuple (True) or leave it out (False)

        :return: list of ``(word, pos, ner)`` tuples, or ``(word, ner)``
            tuples when ``pos`` is false

        **Example**::
            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('ก.ย.', 'NOUN', 'I-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'),
            (' ', 'PUNCT', 'O'), ('ทดสอบ', 'VERB', 'O'),
            ('ระบบ', 'NOUN', 'O'), ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'),
            (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'),
            (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
        self.__pos_tags = pos_tag(
            self.__tokens, engine="perceptron", corpus="orchid_ud")
        self.__x_test = self.__extract_features(self.__pos_tags)
        self.__y = self.crf.predict_single(self.__x_test)

        tagged = self.__pos_tags
        results = []
        for idx, label in enumerate(self.__y):
            word = tagged[idx][0]
            if pos:
                results.append((word, tagged[idx][1], label))
            else:
                results.append((word, label))
        return results

Ejemplo n.º 10

0

Mostrar archivo

Archivo: named_entity.py Proyecto: wannaphongcom/pythainlp

    def get_ner(
        self, text: str, pos: bool = True
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
        """
        Return the named-entity label of every token in ``text``.

        Pipeline: tokenize, POS-tag (perceptron engine, 'orchid_ud' corpus),
        extract features, decode with the CRF model.

        :param string text: Thai text
        :param boolean pos: when True each result tuple also carries the
            part-of-speech tag; when False it is omitted

        :return: list of ``(word, pos, ner)`` tuples, or ``(word, ner)``
            tuples when ``pos`` is false

        **Example**::
            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('ก.ย.', 'NOUN', 'I-DATE'),
            (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'),
            (' ', 'PUNCT', 'O'), ('ทดสอบ', 'VERB', 'O'),
            ('ระบบ', 'NOUN', 'O'), ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'), ('49', 'NUM', 'I-TIME'),
            (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'),
            (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
        self.__pos_tags = pos_tag(
            self.__tokens,
            engine="perceptron",
            corpus="orchid_ud",
        )
        self.__x_test = self.__extract_features(self.__pos_tags)
        self.__y = self.crf.predict_single(self.__x_test)

        labelled = enumerate(self.__y)
        if not pos:
            return [(self.__pos_tags[k][0], ner) for k, ner in labelled]
        return [
            (self.__pos_tags[k][0], self.__pos_tags[k][1], ner)
            for k, ner in labelled
        ]

Ejemplo n.º 11

0

Mostrar archivo

Archivo: app.py Proyecto: muitanprasert/thai-fakenews-chatbot

def similarity(msg, html):
    """
    Measure how many key words of ``msg`` also occur in ``html``.

    ``msg`` is tokenized (newmm-safe, whitespace dropped) and POS-tagged.
    A word counts as a key word when its tag is listed in ``key_tags``, is
    empty, or starts with a character listed in ``key_tags`` (in practice
    the ``'N'`` prefix).

    :param msg: message text
    :param html: article text searched with substring matching
    :return: ``(number of tagged tokens, share of key words found in html)``;
        the share is 0.0 when the message yields no key word
    """
    from pythainlp.tag import pos_tag
    from pythainlp.tokenize import word_tokenize

    # tokenize and keep only words with "key" pos tags
    tokens = word_tokenize(msg, engine='newmm-safe', keep_whitespace=False)
    # NOTE(review): an empty corpus name is passed as-is; confirm which tag
    # set pos_tag falls back to for it.
    tuples = pos_tag(tokens, corpus='')
    key_tags = {
        'N', 'VACT', 'VATT', 'XVBB', 'XVMM', 'DCNM', 'DONM', 'CMTR', 'JCMP',
        'RPRE', 'PROPN'
    }
    words = [
        word for word, tag in tuples
        if tag in key_tags or len(tag) == 0 or tag[0] in key_tags
    ]

    # An empty message, or one without key words, used to divide by zero.
    if not words:
        return len(tuples), 0.0

    # count no. words that appear in the article
    appear = sum(w in html for w in words)
    appear_ratio = appear / len(words)
    return len(tuples), appear_ratio

Ejemplo n.º 12

0

Mostrar archivo

    def test_pos_tag(self):
        """Exercise pos_tag, the unigram/perceptron taggers and pos_tag_sents."""
        tokens = ["ผม", "รัก", "คุณ"]

        # None and an empty token list both yield an empty result.
        self.assertEqual(pos_tag(None), [])
        self.assertEqual(pos_tag([]), [])

        # Same contract when the unigram tagger is called directly.
        self.assertEqual(unigram.tag(None, corpus="pud"), [])
        self.assertEqual(unigram.tag([], corpus="pud"), [])
        self.assertEqual(unigram.tag(None, corpus="orchid"), [])
        self.assertEqual(unigram.tag([], corpus="orchid"), [])

        # Unigram engine through the public entry point.
        self.assertIsNotNone(pos_tag(tokens, engine="unigram",
                                     corpus="orchid"))
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
        )

        # Perceptron engine, through pos_tag and called directly.
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid"))
        self.assertIsNotNone(pos_tag(tokens, engine="perceptron",
                                     corpus="pud"))
        self.assertEqual(perceptron.tag(None, corpus="pud"), [])
        self.assertEqual(perceptron.tag([], corpus="pud"), [])
        self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
        self.assertEqual(perceptron.tag([], corpus="orchid"), [])

        # Sentence-level helper: one tagged list per input sentence.
        self.assertEqual(pos_tag_sents(None), [])
        self.assertEqual(pos_tag_sents([]), [])
        self.assertEqual(
            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
            [
                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
            ],
        )

Ejemplo n.º 13

0

Mostrar archivo

    def get_L_sentences(self, s):
        """
        Tokenize and POS-tag ``s`` and wrap every token in a ``CubeItem``.

        Underscores in a token are replaced by spaces for both ``word`` and
        ``lemma``; the tag is mapped through ``DOrchardToUDPOS``.

        :param s: text to process
        :return: a one-element list holding the list of ``CubeItem`` objects
        """
        from pythainlp import word_tokenize
        from pythainlp.tag import pos_tag  # , pos_tag_sents

        def build_item(position, segment, pos):
            return CubeItem(index=position,
                            word=segment.replace('_', ' '),
                            lemma=segment.replace('_', ' '),
                            upos=DOrchardToUDPOS[pos],
                            xpos='',
                            attrs='',
                            head='',
                            label='',
                            space_after='SpaceAfter=No')

        tagged = pos_tag(word_tokenize(s))
        return [[
            build_item(position, segment, pos)
            for position, (segment, pos) in enumerate(tagged)
        ]]

Ejemplo n.º 14

0

Mostrar archivo

Archivo: clean_and_segmented_sentences.py Proyecto: UDICatNCHU/new_kcm

def th(article):
    """
    Yield POS-tagged keywords for every non-empty line of a Thai article.

    Each line of ``article['text']`` is tokenized with the newmm engine.
    Stopwords, punctuation (ASCII punctuation plus the extra symbols and
    single Thai consonants listed below) and tokens starting with ``$`` are
    dropped; lines left without tokens are skipped.

    :param dict article: record with a ``'text'`` entry, e.g.
        ``{'url': 'https://th.wikipedia.org/wiki?curid=792',
        'title': 'โอเพนออฟฟิศดอตอ็อก', 'id': '792', 'text': '...'}``
    :return: generator of lists of ``(keyword, pos)`` tuples, one list per
        kept line; a missing tag is reported as the string ``'None'``
    """
    # for th
    from pythainlp.corpus import stopwords
    from pythainlp.tokenize import word_tokenize
    from pythainlp.tag import pos_tag
    import string

    extraPunctions = [
        '$', '。', '/', '(', ')', '.', 'ํ', '|', '๐', 'OO', 'o', 'ก', 'ข', 'ฃ',
        'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ',
        'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ',
        'ภ', 'ม', 'ย', 'ร', 'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ'
    ]
    # Sets give constant-time membership tests inside the token filter.
    punctuation = set(string.punctuation)
    punctuation.update(extraPunctions)
    thstopwords = frozenset(stopwords.words('thai'))

    for line in article['text'].split('\n'):
        if not line:
            continue
        tokens = [
            i for i in word_tokenize(line, engine='newmm')
            if i not in thstopwords and i not in punctuation
            and not i.startswith('$')
        ]
        if not tokens:
            continue
        yield [
            (keyword, 'None' if pos is None else pos)
            for keyword, pos in pos_tag(tokens, engine='old')
        ]

Ejemplo n.º 15

0

Mostrar archivo

Archivo: proj.py Proyecto: guiaria/HAFHAD

# Script section: filter the token list `e`, POS-tag it and collect the
# targets of "open light" / "open plug" commands into `open_light`.
#e = ''.join(e)
print(e)
open_light = []
close_light = []
# NOTE(review): this rebinds the name `stopwords` from the object that
# provides .words() to its result, so .words() cannot be called again later.
stopwords = stopwords.words('thai')
stopwords1 = ['สิ', 'ดิ', 'หน่อย']

# NOTE(review): `filter_word1` is the same list object as `e`, so remove()
# below mutates `e` while the loop is iterating over it; the element that
# follows each removed word is skipped.
filter_word1 = e
for word in e:
    # Tokens containing "เปิด" or "ปิด" are never removed.
    if any(["เปิด" in word, "ปิด" in word]):
        continue
    if word in stopwords:
        filter_word1.remove(word)

filter_word = [word1 for word1 in filter_word1 if word1 not in stopwords1]
pos_list = pos_tag(filter_word, engine='artagger')
print(pos_list)

# "เปิดไฟ <target> [เเละ <target>]": the token after the command is
# collected, plus the token after "เเละ" when that conjunction is in third
# position and is not the last token.
if (filter_word[0] == "เปิดไฟ"):
    open_light.append(filter_word[1])
    if len(filter_word) > 2:
        if (filter_word[2] == "เเละ" and filter_word[-1] != "เเละ"):
            open_light.append(filter_word[3])

# Same pattern for the two-token command "เปิด" + "ปลั๊ก", shifted by one.
if (filter_word[0] == "เปิด" and filter_word[1] == "ปลั๊ก"):
    open_light.append(filter_word[2])
    if len(filter_word) > 3:
        if (filter_word[3] == "เเละ" and filter_word[-1] != "เเละ"):
            open_light.append(filter_word[4])

if (filter_word[0] == "ปิดไฟ"):

Ejemplo n.º 16

0

Mostrar archivo

Archivo: get_data_AT.py Proyecto: linloveorm/Withholding_Research

            # print("non-related : "+str(i)+" : "+record_item[i]['fields']['Short Description'])

        for itr in range(len(account_payable)):
            # print(record_item[i]['fields']['Account Payable'][0])
            # print(account_payable[itr]['id'])
            if (record_item[i]['fields']['Account Payable'][0] ==
                    account_payable[itr]['id']):
                # print("Purchase: " + record_item[i]['fields']['Account Payable'][0])
                # print("Payable: "+account_payable[itr]['id'])
                # payable_data.append([account_payable[itr]['fields']['Name'],account_payable[itr]['fields']['Payable ID']])

                arr_cutting.append([
                    i, record_item[i]['fields']['Category'],
                    record_item[i]['id'],
                    pos_tag(word_cutting, engine='unigram', corpus='pud'),
                    pos_tag(request_cutting, engine='unigram',
                            corpus='pud'), inv_amount, bf_vat, vat, pay_amount,
                    withholding, percentage,
                    [
                        account_payable[itr]['fields']['Name'],
                        account_payable[itr]['fields']['Payable ID']
                    ]
                ])  #Append data to the new list

                # print(payable_data)

        # arr_cutting.append([record_item[i]['id'],word_cutting,record_item[i]['fields']['THB Invoice Amount'],record_item[i]['fields']['Tax Withholding Amount'],payable_data])

# print(arr_cutting)

Ejemplo n.º 17

0

Mostrar archivo

Archivo: deepcut_test.py Proyecto: Jirayut558/Seniorproject

def deep_word(text):
    """Tokenize ``text`` with deepcut, tag it with artagger and print the result."""
    tokens = deepcut.tokenize(text)
    print(pos_tag(tokens, engine='artagger'))

Ejemplo n.º 18

0

Mostrar archivo

Archivo: using2.py Proyecto: wannaphong/thai-ner

def get_ner(text):
    """
    Label every token of ``text`` with a named-entity tag.

    The text is tokenized, POS-tagged with the perceptron engine, converted
    to CRF features and decoded by the module-level ``crf`` model.

    :param text: text to analyse
    :return: list of ``(word, pos, ner)`` tuples
    """
    tokens = word_tokenize(text, engine=thaicut)
    tagged = pos_tag(tokens, engine='perceptron')

    word_pos_pairs = []
    for k, word in enumerate(tokens):
        word_pos_pairs.append((word, tagged[k][1]))

    labels = crf.predict_single(extract_features(word_pos_pairs))

    result = []
    for k, label in enumerate(labels):
        result.append((tokens[k], tagged[k][1], label))
    return result

Ejemplo n.º 19

0

Mostrar archivo

Archivo: prepare_data_for_question.py Proyecto: noratap09/QA_question_type_2

        elif(in_name == "B-LEN" or in_name == "I-LEN"): return 9
        elif(in_name == "B-LAW" or in_name == "I-LAW"): return 10   
        elif(in_name == "B-PHONE" or in_name == "I-PHONE"): return 11 
        elif(in_name == "B-ZIP"): return 12
        elif(in_name == "B-EMAIL" or in_name == "I-EMAIL"): return 13  
        else: return 1

# Build one feature matrix per type-2 question in the slice [15000:15003]
# of the dataset, draw it as a heat map and save it as a .npy file.
for count_data , data in enumerate(json_obj['data'][15000:15003],start=1):
    if(data['question_type']==2):
        # One row per token slot: word-vector columns followed by POS columns.
        pre_data = np.zeros((question_len,Word2Vec_len+pos_len),dtype=np.float32)
        question_id = data['question_id']
        print("QUESTION_ID: ",question_id)

        question = data['question'].lower()
        question = deepcut.tokenize(question)
        result_pos = pos_tag(question)

        for n_j,j in enumerate(question,start=0):
                # Only words known to the embedding model get a vector; the
                # vector is shifted and scaled by hard-coded constants
                # (presumably the min/max of the embedding values - TODO confirm).
                if(j in model.wv.vocab.keys()):
                        pre_data[n_j,0:Word2Vec_len] = (model.wv.get_vector(j)+2.6491606)/(2.6491606+2.6473184)
                # One-hot POS: set the column at the tag's position in pos_all.
                pre_data[n_j,Word2Vec_len+(pos_all.index(result_pos[n_j][1]))] = 1.0
        #draw_heat_map
        import heat_map
        temp = list()
        heat_map.make_heatmap("heatmap_question/"+str(question_id)+".png",temp,question,pre_data)
        #check_point
        # NOTE(review): the backslashes in this path are not escaped; they are
        # kept literally only because "\i" is not a recognised escape, which
        # also makes the path Windows-specific.
        np.save("train_data\input_question\input_B_"+str(question_id),pre_data)
        #print(pre_data)

#save final
#ck_point(all_input,all_output,math.ceil((last_data_count+1)/ck_point_time))

Ejemplo n.º 20

0

Mostrar archivo

	def testTagnew(self):
		"""The artagger engine tags a simple sentence as expected (Python newer than 3.3 only)."""
		# Indentation uses tabs only; the original mixed spaces and tabs.
		if sys.version_info > (3,3):
			self.assertEqual(
				pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger'),
				[('ผม', 'PPRS'), ('รัก', 'VSTA'), ('คุณ', 'PPRS')],
			)

Ejemplo n.º 21

0

Mostrar archivo

 def getPOSTagging(text):
     """Return ``pos_tag(text)`` using pos_tag's default engine and corpus."""
     # NOTE(review): the other callers in this file pass a list of tokens to
     # pos_tag; confirm that `text` is already tokenized.
     return pos_tag(text)

Ejemplo n.º 22

0

Mostrar archivo

Archivo: __init__.py Proyecto: somjeat/pythainlp

	def test_tag(self):
		"""The 'old' engine returns the expected tags; artagger returns a list."""
		self.assertEqual(
			pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old'),
			[('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')],
		)
		if sys.version_info >= (3,4):
			# Check the type directly instead of comparing its repr string.
			self.assertIsInstance(
				pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger'), list
			)

Ejemplo n.º 23

0

Mostrar archivo

Archivo: analysis.py Proyecto: yakung/requirement-classification

def pos_tag_refined(words, remove_list=('NCMN', )):
    """
    POS-tag ``words`` and drop unwanted entries.

    :param words: tokens to tag
    :param remove_list: tags whose entries are removed from the result
    :return: the tagged entries whose tag is not in ``remove_list`` and
        whose word is not a single space
    """
    kept = []
    for entry in pos_tag(words):
        if entry[1] in remove_list or entry[0] == ' ':
            continue
        kept.append(entry)
    return kept

Ejemplo n.º 24

0

Mostrar archivo

    def test_chunk_parse(self):
        """chunk_parse returns a value for perceptron/orchid tagged tokens."""
        tagged = pos_tag(
            ["ผม", "รัก", "คุณ"], engine="perceptron", corpus="orchid"
        )
        self.assertIsNotNone(chunk_parse(tagged))

Ejemplo n.º 25

0

Mostrar archivo

Archivo: __init__.py Proyecto: zkan/pythainlp

	def test_tag(self):
		"""Check the 'old' engine, pos_tag_sents and the artagger return type."""
		self.assertEqual(
			pos_tag(word_tokenize("คุณกำลังประชุม"),engine='old'),
			[('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')],
		)
		self.assertEqual(
			pos_tag_sents([["ผม","กิน","ข้าว"],["แมว","วิ่ง"]]),
			[
				[('ผม', 'PPRS'), ('กิน', 'VACT'), ('ข้าว', 'NCMN')],
				[('แมว', 'NCMN'), ('วิ่ง', 'VACT')],
			],
		)
		if sys.version_info >= (3,4):
			# Check the type directly instead of comparing its repr string.
			self.assertIsInstance(
				pos_tag(word_tokenize("ผมรักคุณ"),engine='artagger'), list
			)

Ejemplo n.º 26

0

Mostrar archivo

Archivo: remove_eng.py Proyecto: btylk/Scrap

import pythainlp
import codecs
from pythainlp.tag import pos_tag 
from pythainlp.tokenize import word_tokenize
# Tokenize and POS-tag the first line of the input file.
# The context manager closes the file, which the original never did.
with codecs.open('./data/data7.txt','r','utf-8') as file:
    # Only the first line is read.
    sentence = file.readline()
i=0
words = word_tokenize(sentence, engine="newmm",keep_whitespace=False)

print(pos_tag(words, engine="unigram", corpus="lst20"))

Ejemplo n.º 27

0

Mostrar archivo

	def testTag(self):
		"""The 'old' engine tags a simple sentence as expected."""
		expected = [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')]
		tokens = word_tokenize("คุณกำลังประชุม")
		self.assertEqual(pos_tag(tokens,engine='old'), expected)

Ejemplo n.º 28

0

Mostrar archivo

from pylexto import LexTo
# word_tokenize is called below but was never imported (NameError).
from pythainlp.tokenize import etcc, word_tokenize
from pythainlp.tag import pos_tag
from pythainlp.util import collate


# Tokenize a sample sentence, collate the tokens, POS-tag them and ask the
# user for the tag of every word the tagger could not label.
text = "ค่าจ้าง ค่าเช่า ค่าตอบแทน ค่าบริการ จำเป็นต้องหัก ณ ที่จ่ายส่งสรรพากร"

text_cutting = word_tokenize(text, engine="deepcut")
text_collate = collate(text_cutting)
print("deepcut  :", text_cutting)  # the engine that suits this task best


# Tagging runs on the collated token list, not on the original word order.
text_tag_list = pos_tag(text_collate)
print(text_tag_list)

text_pos = []
pos_vact = []
pos = ""

for word, tag in text_tag_list:
    if tag is None:
        # No tag available: prompt for one on standard input.
        print("Word : " + word)
        pos = input()
        print("POS : " + pos)
        tag = pos
    text_pos.append([word, tag])

Ejemplo n.º 29

0

Mostrar archivo

Archivo: prepare_data.py Proyecto: noratap09/QA_question_type_2

        txt = [x.lower() for x in txt]

        #print("TXT : ",txt)

        ck_answer = list()
        answer = data['answer']

        print("t ->", math.ceil(len(txt) / slide_size))
        for i in range(0, math.ceil(len(txt) / slide_size)):
            pre_data = np.zeros((input_text_len, Word2Vec_len + pos_len),
                                dtype=np.float32)
            pre_ans = np.zeros((num_class), dtype=np.float32)
            input_text = txt[i * slide_size:i * slide_size + input_text_len -
                             1]

            result_pos = pos_tag(input_text)

            print("Tag : ", i)
            print("CK_ANS : ", answer)
            print("QUESTION : ", question)
            print("INPUT TXT : ", "".join(input_text))
            #print(len(txt)," > ",i,":",i*slide_size,"-",i*slide_size+input_text_len)
            #get input feature

            for n_j, j in enumerate(input_text, start=0):
                if (j in model.wv.vocab.keys()):
                    pre_data[n_j, 0:Word2Vec_len] = (model.wv.get_vector(j) +
                                                     2.6491606) / (2.6491606 +
                                                                   2.6473184)
                pre_data[n_j, Word2Vec_len +
                         (pos_all.index(result_pos[n_j][1]))] = 1.0

Ejemplo n.º 30

0

Mostrar archivo


# Load the corpus file, strip its HTML, tokenize it, load the saved word
# vectors and POS-tag the tokens.
# The context manager closes the file, which the original never did.
with open("corpus/665.txt", encoding="utf8") as text_file:
    data = [text_file.read()]

data[0] = cleanhtml(data[0])
print("Raw  :", data)

tokendata = [list(word_tokenize(i, engine='newmm')) for i in data]
print("Tokenized    :", tokendata)
# The saved model loaded below was presumably produced once with:
# model = Word2Vec(tokendata,size=3, min_count=1,window=5,sg=1)
# model.save('665txt-3d.bin')
modelW2V = Word2Vec.load("665txt-3d.bin")
words = list(modelW2V.wv.vocab)
print("Words    :", words)
dataTag = pos_tag(tokendata[0], engine='old')
print(dataTag)

# Buckets for the words of each category.
nouns = []
verbs = []
adverbs = []
prepo = []

for _, x in dataTag:

    if (x == 'NCMN' and _.__len__() > 1 and _ not in nouns
            and nouns.__len__() < 20):
        nouns.append((_, 0))

    if (x == 'VACT' and _.__len__() > 1 and _ not in verbs
            and verbs.__len__() < 20):

Ejemplo n.º 31

0

Mostrar archivo

    def test_pos_tag(self):
        """Exercise pos_tag, the unigram/perceptron taggers and pos_tag_sents."""
        tokens = ["ผม", "รัก", "คุณ"]

        # Default engine and corpus.
        self.assertEqual(pos_tag(None), [])
        self.assertEqual(pos_tag([]), [])
        self.assertEqual(
            pos_tag(["นักเรียน", "ถาม", "ครู"]),
            [("นักเรียน", "NCMN"), ("ถาม", "VACT"), ("ครู", "NCMN")],
        )
        self.assertEqual(
            len(pos_tag(["การ", "เดินทาง", "มี", "ความ", "ท้าทาย"])), 5)

        # Unigram tagger, called directly and through pos_tag.
        self.assertEqual(unigram.tag(None, corpus="pud"), [])
        self.assertEqual(unigram.tag([], corpus="pud"), [])
        self.assertEqual(unigram.tag(None, corpus="orchid"), [])
        self.assertEqual(unigram.tag([], corpus="orchid"), [])
        self.assertIsNotNone(pos_tag(tokens, engine="unigram",
                                     corpus="orchid"))
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertEqual(
            pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
        )

        # assertTrue(value, "NOUN") treated "NOUN" as the failure message and
        # passed for any non-empty tag; compare the tag explicitly.
        self.assertEqual(
            pos_tag(["การ", "รัฐประหาร"], corpus="orchid_ud")[0][1], "NOUN")
        self.assertEqual(
            pos_tag(["ความ", "พอเพียง"], corpus="orchid_ud")[0][1], "NOUN")

        # Perceptron tagger, called directly and through pos_tag.
        self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
        self.assertEqual(perceptron.tag([], corpus="orchid"), [])
        self.assertEqual(perceptron.tag(None, corpus="orchid_ud"), [])
        self.assertEqual(perceptron.tag([], corpus="orchid_ud"), [])
        self.assertEqual(perceptron.tag(None, corpus="pud"), [])
        self.assertEqual(perceptron.tag([], corpus="pud"), [])
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid"))
        # The "orchid" assertion was duplicated; cover "orchid_ud" so that
        # all three corpora checked above are exercised here too.
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid_ud"))
        self.assertIsNotNone(pos_tag(tokens, engine="perceptron",
                                     corpus="pud"))

        # Sentence-level helper: one tagged list per input sentence.
        self.assertEqual(pos_tag_sents(None), [])
        self.assertEqual(pos_tag_sents([]), [])
        self.assertEqual(
            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
            [
                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
            ],
        )

Ejemplo n.º 32

0

Mostrar archivo

def Pos(data):
    """Return the result of ``pos_tag`` for *data* using the 'orchid_ud' corpus.

    :param data: value forwarded unchanged as the first argument of
                 ``pos_tag`` (presumably a list of word tokens -- confirm
                 against the caller)
    :return: whatever ``pos_tag`` returns for *data*
    """
    return pos_tag(data, corpus='orchid_ud')

Ejemplo n.º 33

0

Mostrar archivo

Archivo: named_entity.py Proyecto: pacharapol4066/Product-Position-SNA

    def get_ner(
        self,
        text: str,
        pos: bool = True,
        tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        Tag named entities in a text using the IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: include POS tags in the results (`True`) or
                         exclude them (`False`). The default value is `True`.
                         Ignored when `tag` is `True`.
        :param bool tag: return the text as one string with every entity
                         wrapped in HTML-like tags instead of a list.
                         The default value is `False`.
        :return: if `tag` is `True`, the text with each entity wrapped as
                 ``<TYPE>...</TYPE>``; otherwise a list of
                 (word, POS tag, NER tag) tuples if `pos` is `True`, or a
                 list of (word, NER tag) tuples if `pos` is `False`
        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]

        :Note:
            * POS tags are always computed with
              :func:`pythainlp.tag.pos_tag` using engine `perceptron` and
              corpus `orchid_ud`, because they are the input of the
              feature extraction; `pos` only controls whether they are
              included in the result.

        :Example:

            >>> from pythainlp.tag.named_entity import ThaiNameTagger
            >>>
            >>> ner = ThaiNameTagger()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('15', 'NUM', 'B-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('ก.ย.', 'NOUN', 'I-DATE'), (' ', 'PUNCT', 'I-DATE'),
            ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
            ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'),
            ('เวลา', 'NOUN', 'O'), (' ', 'PUNCT', 'O'),
            ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
            ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'),
            ('น.', 'NOUN', 'I-TIME')]
            >>>
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            pos=False)
            [('วันที่', 'O'), (' ', 'O'),
            ('15', 'B-DATE'), (' ', 'I-DATE'),
            ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
            ('61', 'I-DATE'), (' ', 'O'),
            ('ทดสอบ', 'O'), ('ระบบ', 'O'),
            ('เวลา', 'O'), (' ', 'O'),
            ('14', 'B-TIME'), (':', 'I-TIME'),
            ('49', 'I-TIME'), (' ', 'I-TIME'),
            ('น.', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                            tag=True)
            'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
        """
        tokens = _tokenizer.word_tokenize(text)
        pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
        x_test = ThaiNameTagger.__extract_features(pos_tags)
        y = self.crf.tag(x_test)

        if tag:
            # Collect fragments and join once instead of growing a string.
            pieces = []
            open_label = ""  # entity type of the currently open tag, if any
            for (word, _), ner in zip(pos_tags, y):
                if ner.startswith("B-"):
                    # A new entity begins: close the previous one first.
                    if open_label != "":
                        pieces.append("</" + open_label + ">")
                    open_label = ner[2:]
                    pieces.append("<" + open_label + ">")
                elif ner == "O" and open_label != "":
                    # Left the entity: close it before the outside word.
                    pieces.append("</" + open_label + ">")
                    open_label = ""
                pieces.append(word)

            # An entity that runs up to the last token still needs closing.
            if open_label != "":
                pieces.append("</" + open_label + ">")

            return "".join(pieces)

        if pos:
            return [
                (word, pos_label, ner)
                for (word, pos_label), ner in zip(pos_tags, y)
            ]

        return [(word, ner) for (word, _), ner in zip(pos_tags, y)]

Ejemplo n.º 34

0

Mostrar archivo

Archivo: __init__.py Proyecto: wannaphongcom/pythainlp

    def test_pos_tag(self):
        """Check pos_tag, the unigram/perceptron taggers and pos_tag_sents.

        Covers blank input (``None`` and ``[]``), every engine/corpus pair
        used below, and two exact-output expectations.
        """
        words = ["ผม", "รัก", "คุณ"]
        meeting_tagged = [
            ("คุณ", "PPRS"),
            ("กำลัง", "XVBM"),
            ("ประชุม", "VACT"),
        ]

        # Blank input with default settings gives an empty list.
        for blank in (None, []):
            self.assertEqual(pos_tag(blank), [])

        # The unigram tagger itself also returns [] for blank input.
        for corpus_name in ("pud", "orchid"):
            for blank in (None, []):
                self.assertEqual(unigram.tag(blank, corpus=corpus_name), [])

        # Unigram engine through the pos_tag entry point.
        for corpus_name in ("orchid", "pud"):
            self.assertIsNotNone(
                pos_tag(words, engine="unigram", corpus=corpus_name)
            )
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
            meeting_tagged,
        )

        # Perceptron engine: entry point first, then the tagger directly.
        for corpus_name in ("orchid", "pud"):
            self.assertIsNotNone(
                pos_tag(words, engine="perceptron", corpus=corpus_name)
            )
        for corpus_name in ("pud", "orchid"):
            for blank in (None, []):
                self.assertEqual(
                    perceptron.tag(blank, corpus=corpus_name), []
                )

        # artagger engine: blank and regular input, then an exact result.
        for candidate in (None, [], words):
            self.assertIsNotNone(pos_tag(candidate, engine="artagger"))
        self.assertEqual(
            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
            meeting_tagged,
        )

        # Sentence-level tagging.
        for blank in (None, []):
            self.assertEqual(pos_tag_sents(blank), [])
        sentences = [["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]
        expected_sentences = [
            [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
            [("แมว", "NCMN"), ("วิ่ง", "VACT")],
        ]
        self.assertEqual(pos_tag_sents(sentences), expected_sentences)