Esempio n. 1
0
 def tag(text1):
     """Tag a sequence of words with artagger.

     ``artagger`` is imported lazily; if it is missing, one installation
     attempt is made through ``pythainlp.tools.install_package`` before the
     import is retried.

     :param text1: iterable of word strings; they are joined with single
         spaces before being passed to ``Tagger().tag``.
     :return: list of ``(word, tag)`` tuples built from the ``word`` and
         ``tag`` attributes of each item the tagger returns.
     :raises SystemExit: if ``artagger`` still cannot be imported after the
         installation attempt.
     """
     try:
         from artagger import Tagger
     except ImportError:
         from pythainlp.tools import install_package
         install_package(
             'https://github.com/wannaphongcom/artagger/archive/master.zip'
         )
         try:
             from artagger import Tagger
         except ImportError:
             # Imported locally so this function does not rely on a
             # module-level ``import sys`` being present.
             import sys
             # Passing the message to sys.exit() writes it to stderr and
             # exits with a non-zero status; the previous print() followed by
             # sys.exit(0) reported success to the calling process.
             sys.exit(
                 "Error ! using 'pip install https://github.com/wannaphongcom/artagger/archive/master.zip'"
             )
     words = Tagger().tag(' '.join(text1))
     return [(word.word, word.tag) for word in words]
Esempio n. 2
0
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
'''
from __future__ import absolute_import,unicode_literals
import os
import sys
import re
import torch

#numpy and fastai
try:
    import numpy as np
    from fastai.text import *
    import dill as pickle
except ImportError:
    from pythainlp.tools import install_package
    install_package('fastai')
    install_package('numpy')
    try:
        import numpy as np
        from fastai.text import *
        import dill as pickle
    except ImportError:
        print("Error installing using 'pip install fastai numpy dill'")
        sys.exit(0)

#import torch
try:
    import torch
except ImportError:
    print('PyTorch required. See https://pytorch.org/.')
Esempio n. 3
0
# -*- coding: utf-8 -*-
'''
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
'''
from __future__ import absolute_import,unicode_literals
import os
import sys
from collections import defaultdict

#numpy and dill
try:
    import numpy as np
    import dill as pickle
except ImportError:
    from pythainlp.tools import install_package
    install_package('numpy')
    install_package('dill')
    try:
        import numpy as np
        import dill as pickle
    except ImportError:
        print("Error installing using 'pip install numpy dill'")
        sys.exit(0)

#import torch
try:
    import torch
except ImportError:
    print('PyTorch required. See https://pytorch.org/.')
import torch
from torch.autograd import Variable
Esempio n. 4
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
try:
    import deepcut
except ImportError:
    '''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ'''
    from pythainlp.tools import install_package
    install_package("deepcut")
    try:
        import deepcut
    except ImportError:
        raise Exception("ImportError ! using pip install deepcut")


def segment(text):
    """Tokenize ``text`` with deepcut.

    Thin wrapper: ``text`` is forwarded unchanged to ``deepcut.tokenize`` and
    that call's result is returned as-is.
    """
    tokens = deepcut.tokenize(text)
    return tokens
Esempio n. 5
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import,print_function,unicode_literals
import sys
import re
try:
	import icu
except ImportError:
	from pythainlp.tools import install_package
	install_package('pyicu')
	try:
		import icu
	except ImportError:
		sys.exit('Error ! using pip install pyicu')
def gen_words(text):
    """Yield consecutive slices of ``text`` cut at ICU word boundaries.

    A word-level ``icu.BreakIterator`` created for the ``"th"`` locale is
    attached to ``text``; every boundary offset it reports closes one slice,
    and the next slice starts where the previous one ended.
    """
    breaker = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    breaker.setText(text)
    start = breaker.first()
    for stop in breaker:
        piece = text[start:stop]
        # The current boundary becomes the start of the next slice.
        start = stop
        yield piece

def segment(text):
    """Split ``text`` into a list of words using ``gen_words``.

    Before segmentation, every run of characters that are not in the range
    U+0E00-U+0E7F and are neither a newline nor a space is padded with one
    space on each side.
    """
    padded = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text)
    return [word for word in gen_words(padded)]
if __name__ == "__main__":
	# Manual demo, run only when the file is executed as a script: print the
	# word lists produced by segment() for a handful of sample strings
	# (Thai only, Thai mixed with Latin letters and digits, a string that
	# spans two lines, and Thai text containing a space).
	print(segment('ทดสอบระบบตัดคำด้วยไอซียู'))
	print(segment('ผมชอบพูดไทยคำ English'))
	print(segment('ผมชอบพูดไทยคำEnglishคำ'))
	print(segment("""ผมชอบพูดไทยคำEnglish540
    บาท"""))
	print(segment('ประหยัด ไฟเบอห้า'))
Esempio n. 6
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals, print_function
import sys
import six
if six.PY2:
    print("Thai sentiment in pythainlp. Not support Python 2")
    sys.exit(0)
try:
    from wordcut import Wordcut
except ImportError:
    '''
    ในกรณีที่ยังไม่ติดตั้ง wordcutpy ในระบบ
    '''
    from pythainlp.tools import install_package
    install_package('wordcutpy')
    try:
        from wordcut import Wordcut
    except ImportError:
        sys.exit('Error ! using $ pip install wordcutpy')


def segment(text, data=None):
    """Tokenize ``text`` with wordcut.

    :param text: text handed to the tokenizer's ``tokenize`` method.
    :param data: optional iterable of words. When truthy, its de-duplicated
        entries are used as the tokenizer's word list; otherwise the
        tokenizer comes from ``Wordcut.bigthai()``.
    :return: whatever ``tokenize(text)`` returns.
    """
    if data:
        # Custom word list: drop duplicates before building the tokenizer.
        cutter = Wordcut(list(set(data)))
    else:
        cutter = Wordcut.bigthai()
    return cutter.tokenize(text)
Esempio n. 7
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
try:
    import icu
except ImportError:
    from pythainlp.tools import install_package
    install_package('pyicu')
    try:
        import icu
    except ImportError:
        sys.exit('Error ! using pip install pyicu')


# Transliterate Thai text into Latin script
def romanization(data):
    """Transliterate Thai text into Latin script.

    Takes the text as a ``str`` and returns the Latin text as a ``str``,
    using the ICU transliterator created for the ``'Thai-Latin'`` id.
    """
    return icu.Transliterator.createInstance('Thai-Latin').transliterate(data)
Esempio n. 8
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals,print_function
import sys
import six
if six.PY2:
	print("Thai sentiment in pythainlp. Not support Python 2")
	sys.exit(0)
try:
    from wordcut import Wordcut
except ImportError:
	'''
    ในกรณีที่ยังไม่ติดตั้ง wordcutpy ในระบบ
    '''
	from pythainlp.tools import install_package
	install_package('wordcutpy')
	try:
		from wordcut import Wordcut
	except ImportError:
		sys.exit('Error ! using $ pip install wordcutpy')

def segment(text, data=None):
    """Tokenize ``text`` with wordcut.

    When ``data`` is truthy its de-duplicated entries become the tokenizer's
    word list; otherwise the tokenizer is obtained from ``Wordcut.bigthai()``.
    Returns the result of calling ``tokenize(text)`` on that tokenizer.
    """
    cutter = Wordcut(list(set(data))) if data else Wordcut.bigthai()
    return cutter.tokenize(text)
Esempio n. 9
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals
# NLP
import re
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from pythainlp.corpus import stopwords
thaicut="newmm" # ตัวตัดคำ
# CRF
try:
    import sklearn_crfsuite
except ImportError:
    from pythainlp.tools import install_package
    install_package('sklearn-crfsuite')
    import sklearn_crfsuite
# FILE
import glob
import codecs
from pythainlp.corpus import get_file,download

stopwords = stopwords.words('thai')


def isThai(chr):  # Check whether the character is a Thai char
    """Return True when the code point of ``chr`` is in 3584..3711 inclusive.

    That range is U+0E00 through U+0E7F; any other code point gives False.
    """
    return 3584 <= ord(chr) <= 3711
def isThaiWord(word): # เช็คว่าเป็นคำภาษาไทย
 t=True
 for i in word:
Esempio n. 10
0
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
'''
from __future__ import absolute_import, unicode_literals
import os
import sys
import re
import torch

#numpy and fastai
try:
    import numpy as np
    from fastai.text import *
    import dill as pickle
except ImportError:
    from pythainlp.tools import install_package
    install_package('fastai')
    install_package('numpy')
    try:
        import numpy as np
        from fastai.text import *
        import dill as pickle
    except ImportError:
        print("Error installing using 'pip install fastai numpy dill'")
        sys.exit(0)

#import torch
try:
    import torch
except ImportError:
    print('PyTorch required. See https://pytorch.org/.')
Esempio n. 11
0
# -*- coding: utf-8 -*-
from __future__ import print_function

try:
    import numpy as np
    import keras
except ImportError:
    from pythainlp.tools import install_package
    install_package('keras')
    install_package('numpy')

from pythainlp.corpus import get_file,download

from keras.models import Model, load_model
from keras.layers import Input
import numpy as np
class thai2rom:
    def __init__(self):
        '''
        Thai2Rom
        '''
        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256
        self.num_samples = 648241
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path==None:
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')
        self.input_texts = []
        self.target_texts = []
Esempio n. 12
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals
import sys
try:
    import deepcut
except ImportError:
	'''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ'''
	from pythainlp.tools import install_package
	install_package('deepcut')
	try:
		import deepcut
	except ImportError:
		sys.exit('Error ! using pip install deepcut')
def segment(text):
    """Return the result of ``deepcut.tokenize`` applied to ``text``."""
    result = deepcut.tokenize(text)
    return result
Esempio n. 13
0
# -*- coding: utf-8 -*-
'''
Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
'''
from __future__ import absolute_import, unicode_literals
import six
import sys
if six.PY2:
    print("Thai sentiment in pythainlp. Not support python 2.7")
    sys.exit(0)
try:
    from gensim.models import KeyedVectors
    import numpy as np
except ImportError:
    from pythainlp.tools import install_package
    install_package('gensim')
    install_package('numpy')
    try:
        from gensim.models import KeyedVectors
        import numpy as np
    except ImportError:
        print("Error ! using 'pip install gensim numpy'")
        sys.exit(0)
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import get_file
from pythainlp.corpus import download as download_data
import os


def download():
    path = get_file('thai2vec02')
Esempio n. 14
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
try:
    import deepcut
except ImportError:
    '''ในกรณีที่ยังไม่ติดตั้ง deepcut ในระบบ'''
    from pythainlp.tools import install_package
    install_package('deepcut')
    try:
        import deepcut
    except ImportError:
        sys.exit('Error ! using pip install deepcut')


def segment(text):
    """Tokenize ``text`` by delegating to deepcut.

    :param text: value passed straight through to ``deepcut.tokenize``.
    :return: the value returned by ``deepcut.tokenize``.
    """
    tokenize = deepcut.tokenize
    return tokenize(text)
Esempio n. 15
0
import re
import sys

from pythainlp.corpus import download, get_file
from pythainlp.tokenize import word_tokenize


# numpy and fastai
try:
    import numpy as np
    from fastai.text import *
    import dill as pickle
except ImportError:
    from pythainlp.tools import install_package

    install_package("fastai")
    install_package("numpy")
    try:
        import numpy as np
        from fastai.text import *
        import dill as pickle
    except ImportError:
        print("Error installing using 'pip install fastai numpy dill'")
        sys.exit(0)

# import torch
try:
    import torch
except ImportError:
    print("PyTorch required. See https://pytorch.org/.")
Esempio n. 16
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals
import sys
try:
    from pylexto import LexTo
except ImportError:
	from pythainlp.tools import install_package
	install_package('https://github.com/wannaphongcom/pylexto/archive/master.zip')
	try:
		from pylexto import LexTo
	except ImportError:
		sys.exit('Error ! using pip install https://github.com/wannaphongcom/pylexto/archive/master.zip')
def segment(text, full=False):
    """Tokenize ``text`` with a fresh ``LexTo`` instance.

    :param text: text passed to ``LexTo().tokenize``.
    :param full: when it compares equal to ``True``, the ``(words, types)``
        pair is returned; otherwise only ``words``.
    """
    words, types = LexTo().tokenize(text)
    # The ``== True`` comparison is kept deliberately: a plain truthiness
    # test would change the result for truthy values that are not equal to
    # True.
    return (words, types) if full == True else words
Esempio n. 17
0
# -*- coding: utf-8 -*-
'''
Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
'''
from __future__ import absolute_import,unicode_literals
import six
import sys
if six.PY2:
	print("Thai sentiment in pythainlp. Not support python 2.7")
	sys.exit(0)
try:
	from gensim.models import KeyedVectors
	import numpy as np
except ImportError:
	from pythainlp.tools import install_package
	install_package('gensim')
	install_package('numpy')
	try:
		from gensim.models import KeyedVectors
		import numpy as np
	except ImportError:
		print("Error ! using 'pip install gensim numpy'")
		sys.exit(0)
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import get_file
from pythainlp.corpus import download as download_data
import os

def download():
	path = get_file('thai2vec02')
	if path==None:
Esempio n. 18
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
try:
    from pylexto import LexTo
except ImportError:
    from pythainlp.tools import install_package
    install_package(
        'https://github.com/wannaphongcom/pylexto/archive/master.zip')
    try:
        from pylexto import LexTo
    except ImportError:
        sys.exit(
            'Error ! using pip install https://github.com/wannaphongcom/pylexto/archive/master.zip'
        )


def segment(text, full=False):
    """Tokenize ``text`` with LexTo.

    ``LexTo().tokenize(text)`` yields a ``(words, types)`` pair. Both are
    returned as a tuple when ``full`` compares equal to ``True``; in every
    other case only ``words`` is returned.
    """
    tokenizer = LexTo()
    words, types = tokenizer.tokenize(text)
    # Equality with True (not truthiness) is what selects the full result.
    if full == True:
        return (words, types)
    return words
Esempio n. 19
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals, print_function
import sys
import six
try:
    from wordcut import Wordcut
except ImportError:
    '''
    ในกรณีที่ยังไม่ติดตั้ง wordcutpy ในระบบ
    '''
    from pythainlp.tools import install_package
    install_package("wordcutpy")
    try:
        from wordcut import Wordcut
    except ImportError:
        raise Exception("ImportError ! using $ pip install wordcutpy")


def segment(text, data=None):
    """Tokenize ``text`` with wordcut, optionally using a custom word list.

    :param text: text handed to the tokenizer's ``tokenize`` method.
    :param data: optional iterable of words; ignored when falsy.
    :return: the result of ``tokenize(text)``.
    """
    if not data:
        # No custom words supplied: use the tokenizer from Wordcut.bigthai().
        return Wordcut.bigthai().tokenize(text)
    vocabulary = list(set(data))  # de-duplicate the supplied words
    return Wordcut(vocabulary).tokenize(text)
Esempio n. 20
0
# -*- coding: utf-8 -*-
'''
Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
'''
from __future__ import absolute_import, unicode_literals
import os
import sys
from collections import defaultdict

#numpy and dill
try:
    import numpy as np
    import dill as pickle
except ImportError:
    from pythainlp.tools import install_package
    install_package('numpy')
    install_package('dill')
    try:
        import numpy as np
        import dill as pickle
    except ImportError:
        print("Error installing using 'pip install numpy dill'")
        sys.exit(0)

#import torch
try:
    import torch
except ImportError:
    print('PyTorch required. See https://pytorch.org/.')
import torch
from torch.autograd import Variable
Esempio n. 21
0
# -*- coding: utf-8 -*-
from __future__ import print_function

try:
    import numpy as np
    import keras
except ImportError:
    from pythainlp.tools import install_package
    install_package('keras')
    install_package('numpy')

from pythainlp.corpus import get_file, download

from keras.models import Model, load_model
from keras.layers import Input
import numpy as np


class thai2rom:
    def __init__(self):
        '''
        Thai2Rom
        '''
        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256
        self.num_samples = 648241
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path == None:
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')