Example no. 1
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('ar')
         sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == [
             'يبحث',
             'علم',
             'الحاسوب',
             'استخدام',
             'الحوسبة',
             'ب',
             'جميع',
             'اشكال',
             'ها',
             'ل',
             'حل',
             'المشكلات',
         ]
     except LookupError as e:
         raise SkipTest(str(e)) from e
Example no. 2
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('ar')
         sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == [
             'يبحث',
             'علم',
             'الحاسوب',
             'استخدام',
             'الحوسبة',
             'ب',
             'جميع',
             'اشكال',
             'ها',
             'ل',
             'حل',
             'المشكلات',
         ]
     except LookupError as e:
         raise SkipTest(str(e))
Example no. 3
def load_stanford_segmenter():
    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
        return True
    except LookupError:
        return False
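Unlike the other examples, this helper returns a boolean instead of raising, which makes it easy to gate individual tests with pytest's skipif marker. A minimal usage sketch, assuming pytest and that load_stanford_segmenter and StanfordSegmenter are in scope (the test name and body are illustrative):

import pytest

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


@pytest.mark.skipif(
    not load_stanford_segmenter(),
    reason="NLTK was unable to find the Stanford segmenter jars.",
)
def test_stanford_segmenter_available():
    # Any successful call to segment() confirms the jars are usable.
    seg = StanfordSegmenter()
    seg.default_config("zh")
    assert seg.segment("测试".split()).strip()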
Example no. 4
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     seg = StanfordSegmenter()
     seg.default_config("zh")
     sent = "这是斯坦福中文分词器测试"
     segmented_sent = seg.segment(sent.split())
     assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
Example no. 5
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('zh')
         sent = u"这是斯坦福中文分词器测试"
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
     except LookupError as e:
         pytest.skip(str(e))
Example no. 6
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('zh')
         sent = u"这是斯坦福中文分词器测试"
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
     except LookupError as e:
         raise SkipTest(str(e))
Example no. 7
def setup_module(module):
    import pytest

    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        pytest.skip("Tests for nltk.tokenize.stanford_segmenter skipped: %s" %
                    str(e))

    try:
        StanfordTokenizer()
    except LookupError:
        pytest.skip(
            "Tests for nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
        )
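The same module-wide gate can also be written as an autouse, module-scoped fixture instead of setup_module. A minimal sketch, assuming pytest and the NLTK segmenter import (the fixture name is illustrative):

import pytest

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


@pytest.fixture(scope="module", autouse=True)
def require_stanford_segmenter():
    # Skips every test in the module when the jars cannot be located.
    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        pytest.skip("Stanford segmenter jars not found: %s" % e)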
Example no. 8
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     seg = StanfordSegmenter()
     seg.default_config("ar")
     sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
     segmented_sent = seg.segment(sent.split())
     assert segmented_sent.split() == [
         "يبحث",
         "علم",
         "الحاسوب",
         "استخدام",
         "الحوسبة",
         "ب",
         "جميع",
         "اشكال",
         "ها",
         "ل",
         "حل",
         "المشكلات",
     ]
Example no. 9
import os

from polyglot.text import Text
from rake_nltk import Rake

import nltk
import nltk.parse.stanford as SParse
import nltk.tag.stanford as STag
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

nltk.download('stopwords')
from nltk.corpus import stopwords

# Point NLTK at the Stanford model/jar directories and the local JDK
# (Windows-style paths, ';' separator).
os.environ['STANFORD_MODELS'] = ('stanford-segmenter-2018-10-16/data/;'
                                 'stanford-postagger-full-2018-10-16/models/')
os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2018-10-17'
os.environ['CLASSPATH'] = 'stanford-parser-full-2018-10-17'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-11.0.1'

# Segment the raw Arabic text in sample.txt into whitespace-separated tokens.
segmenter = StanfordSegmenter(
    'stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar')
segmenter.default_config('ar')
text = segmenter.segment_file('sample.txt')
print(text)

# POS-tag the segmented tokens with the Stanford Arabic tagger model.
tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'stanford-postagger-full-2018-10-16/stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

# Parse each sentence (split on '.') with the Arabic factored grammar.
parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)
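segment_file() reads its input from disk; for text that is already in memory, the same object exposes segment(), which the tests above call on a token list. A short sketch reusing the segmenter configured above (the sentence is illustrative):

# Segmenting an in-memory sentence rather than a file.
sentence = 'يبحث علم الحاسوب استخدام الحوسبة'
print(segmenter.segment(sentence.split()))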