def test_stanford_segmenter_arabic(self):
    """
    Test the Stanford Word Segmenter for Arabic (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('ar')
        sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == [
            'يبحث', 'علم', 'الحاسوب', 'استخدام', 'الحوسبة', 'ب', 'جميع',
            'اشكال', 'ها', 'ل', 'حل', 'المشكلات',
        ]
    except LookupError as e:
        raise SkipTest(str(e)) from e
def test_stanford_segmenter_arabic(self):
    """
    Test the Stanford Word Segmenter for Arabic (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('ar')
        sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == [
            'يبحث', 'علم', 'الحاسوب', 'استخدام', 'الحوسبة', 'ب', 'جميع',
            'اشكال', 'ها', 'ل', 'حل', 'المشكلات',
        ]
    except LookupError as e:
        raise SkipTest(str(e))
def load_stanford_segmenter():
    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
        return True
    except LookupError:
        return False
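A helper like this is typically wired into a pytest skipif marker so the segmenter tests are skipped when the Stanford jars or models are not on the classpath. A minimal sketch, assuming the marker name check_stanford_segmenter (illustrative, not from the original snippet):

import pytest

# Hypothetical marker: any test decorated with it is skipped when the
# Stanford segmenter jar/models cannot be located by load_stanford_segmenter().
check_stanford_segmenter = pytest.mark.skipif(
    not load_stanford_segmenter(),
    reason="NLTK was unable to find the Stanford segmenter jar/models.",
)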
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)
    """
    seg = StanfordSegmenter()
    seg.default_config("zh")
    sent = "这是斯坦福中文分词器测试"
    segmented_sent = seg.segment(sent.split())
    assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('zh')
        sent = u"这是斯坦福中文分词器测试"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
    except LookupError as e:
        pytest.skip(str(e))
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('zh')
        sent = u"这是斯坦福中文分词器测试"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
    except LookupError as e:
        raise SkipTest(str(e))
def setup_module(module):
    import pytest

    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        pytest.skip("Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e))

    try:
        StanfordTokenizer()
    except LookupError:
        pytest.skip(
            "Tests for nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
        )
def test_stanford_segmenter_arabic(self):
    """
    Test the Stanford Word Segmenter for Arabic (default config)
    """
    seg = StanfordSegmenter()
    seg.default_config("ar")
    sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
    segmented_sent = seg.segment(sent.split())
    assert segmented_sent.split() == [
        "يبحث", "علم", "الحاسوب", "استخدام", "الحوسبة", "ب", "جميع",
        "اشكال", "ها", "ل", "حل", "المشكلات",
    ]
import os

from polyglot.text import Text
from rake_nltk import Rake
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
# Module aliases assumed for the Stanford tagger/parser wrappers used below.
from nltk.tag import stanford as STag
from nltk.parse import stanford as SParse

# Point NLTK at the Stanford tools (Windows-style paths and ';' classpath separator).
os.environ['STANFORD_MODELS'] = (
    'stanford-segmenter-2018-10-16/data/;stanford-postagger-full-2018-10-16/models/'
)
os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2018-10-17'
os.environ['CLASSPATH'] = 'stanford-parser-full-2018-10-17'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-11.0.1'

# Segment the Arabic text in sample.txt.
segmenter = StanfordSegmenter(
    'stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar')
segmenter.default_config('ar')
text = segmenter.segment_file('sample.txt')
print(text)

# POS-tag the segmented tokens with the Arabic tagger model.
tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'stanford-postagger-full-2018-10-16/stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

# Parse each sentence with the Arabic factored parser model.
parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)