Example #1
import os
import re
import sys
from collections import Counter

from nltk.stem.isri import ISRIStemmer


def main():

    # Open all files related to removing stop words or punctuation from the data.
    with open(r"../data/arstoplist.txt") as sw_in:
        stopwords = sw_in.read().splitlines()
    with open("../data/arabpunct.txt") as punct_in:
        punctlist = punct_in.read().splitlines()

    directory = sys.argv[1]

    # Give location of input files.
    files = os.listdir("../out/" + directory + "/ar/")

    st = ISRIStemmer()
    #rx_en = re.compile(r'\D+')
    tokens = []
    counter = 0
    filelist = []

    for f in files:
        if "txt" in f:
            counter += 1
            f_in = open("../out/" + directory + "/ar/" + f)  # 'rU' mode is gone in Python 3; default text mode suffices
            lines = f_in.readlines()
            f_in.close()
            filelist.extend(lines)

    print("Files read.")

    stemmed = {}
    types = {}

    f_out = open('../out/testset-tokenized-' + directory + '.txt', 'w')
    compl_list = []

    # Define all patterns that shall be excluded (compiled once, outside the loop).
    rx_ar = re.compile(u'^[\u0621-\u064A]+$')  # excludes Arabic words that have numbers attached to them
    rx_ar2 = re.compile(u'^(\u0622{2,})')

    for line in filelist:
        #line = line.strip()
        #tokenize = word_tokenize(line)

        # Tokenize the text (tokenizer() is assumed to be defined/imported elsewhere in the project).
        tokenize = tokenizer(line)

        #tokenize.sort() # Comment this out after the test-set has been used?

        for w in tokenize:
            if len(w) == 1:
                pass
            elif rx_ar2.match(w):
                pass
            elif rx_ar.match(w):
                f_out.write(w + "\n")
                compl_list.append(w)
            else:
                pass
    f_out.close()

    # add back in

    for w in compl_list:
        types[w] = 0
        #if punctlist[0] in compl_list or punctlist[1] in compl_list or punctlist[2] or punctlist[3] in compl_list:
        #    if len(w) > 1: # ERROR
        #        new_w = w[:-1] # ERROR! This strips off Arabic letters although they are not in the punctlist
        #        types[new_w] = 0
        #        tokens.append(new_w)
        #    else:
        #        types[w] = 0
        #        tokens.append(w)

    print(str(len(types)) + " different words.")
    print("Punctuation separated.")

    # Here the actual stemming happens.
    verbs = {}
    c = -1
    for w in types:
        c += 1
        if w not in stopwords:
            stm = st.stem(w)
            stemmed[w] = stm
            verbs[stm] = 0
        if c % 10000 == 0:
            print(str(c) + " words stemmed.")
    print("File stemmed.")

    # print the stemmed words and their unstemmed versions to a file
    f_out = open('../out/stem_tok_' + directory + '.txt', 'w')
    wordlist = []
    for w in verbs.keys():
        # Skip words that are longer than 4 letters. Verbs in Arabic are usually 3
        # letters long; in very rare cases they can be 2 or 4 letters long as well.
        if len(w) > 4:
            pass
        else:
            wordlist.append(w)
            #f_out.write(w + "\t" + stemmed[w])
            #f_out.write(w + "\n")
    wordlist.sort()
    for w in wordlist:
        f_out.write(w + "\n")
    f_out.write("No. of verbs: " + str(len(wordlist)))  # Really verbs? Why not wordlist?
    f_out.close()

    # Handle some corpus stats. `tokens` is never filled in above (only in the
    # commented-out block), so count the collected words instead.
    corp_stat = Counter(compl_list)
    for w, n in corp_stat.most_common(10):
        print("token: " + w + "\tno.: " + str(n))
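
# Assumed entry point: the script expects the name of a sub-directory of ../out/
# as its first command-line argument (it becomes sys.argv[1] above).
if __name__ == "__main__":
    main()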
Example #2
import pickle
# load the dataset
data = open('ManualAnnotatedFakeNewsDataset.txt').read()
#data = open('AutomaticAnnotatedFakeNewsDataset.txt').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    labels.append(content[0])
    texts.append(" ".join(content[1:]))
#stemming
data1 = []
from nltk import word_tokenize

from nltk.stem.isri import ISRIStemmer

st = ISRIStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())

#print(data1[:10])
#tashfeen
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        # light-stem each token with Tashaphyne (loop body is a minimal sketch,
        # assumed to mirror the ISRI loop above)
        ArListem.light_stem(a)
        tweet = tweet + ArListem.get_stem() + " "
    data2.append(tweet.strip())
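
# Optional sketch (our assumption, not from the snippet above): Tashaphyne can also
# return the extracted root rather than the light stem via get_root().
roots = []
for tx in texts:
    word_roots = []
    for a in word_tokenize(tx):
        ArListem.light_stem(a)
        word_roots.append(ArListem.get_root())
    roots.append(" ".join(word_roots))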
Example #3
 def stemming_text(self, text):
     st = ISRIStemmer()
     return [st.stem(w) for w in text]


import re

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer


def clean_data():
    # `docs` is assumed to be provided elsewhere as a list of dicts with a 'content' key.
    tokens = []
    for doc in docs:
        # print (doc)
        for line in doc['content']:
            # remove non-alphabetical and non-Arabic characters
            text = re.sub(r'[\d+ a-zA-Z? & , \xd8 « » . :"،]', ' ', line)
            tkns = text.split()
            tokens.append(tkns)  # produces a list of lists of tokens
    cleaned_data = [item for item in tokens if item != []]
    return cleaned_data


stemmer = ISRIStemmer()
data = clean_data()  # this is a list of lists of tokens


def lemmatizer(token):
    #print ("Data lemmatized")
    token = stemmer.pre32(token)  # removes the three-letter and two-letter prefixes
    token = stemmer.suf32(token)  # removes the three-letter and two-letter suffixes
    token = stemmer.norm(token, num=1)  # removes diacritics
    return token
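
# Minimal usage sketch (assumption, not part of the original fragment): run
# lemmatizer() over every token in the cleaned corpus returned by clean_data().
lemmatized = [[lemmatizer(tok) for tok in doc_tokens] for doc_tokens in data]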


def stop_words():
    stop_words = stopwords.words('arabic')
    return stop_words
 def __init__(self):
     self.stemmer = ISRIStemmer()
     self.stopWordsIndex = ArabicStopWordsIndex(self)
     self.stopWordsIndex.buildIndex()
Example #6
import string

#
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()

text = "على قيادة المؤتمر الشعبي العام قراءة رسالة الشعب جيدا من خلال احتشاد ميدان السبعين ، والتي تعني تحمل مسؤليته"

words = text.split()

new_words = []

for word in words:
    #stem word
    new_word = isri.stem(word)
    #print("."+new_word+".")

    #don't append if stemming turns it into whitespace/""
    if new_word != "":
        new_words.append(new_word)

#return this
new_text = ' '.join(new_words)

print(new_text)
 def __init__(self):
     self.st = ISRIStemmer()
Example #8
def _getstem(_word):
    st = ISRIStemmer()
    return st.stem(_word)


import json
import traceback

import nltk
import tinysegmenter
from nltk import word_tokenize
from nltk.stem import RSLPStemmer, WordNetLemmatizer
from nltk.stem.isri import ISRIStemmer
from nltk.stem.snowball import SnowballStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
#from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian stemmer (Sastrawi)
stem_pt = RSLPStemmer()  # Brazilian Portuguese stemmer
stem_ja = tinysegmenter.TinySegmenter()  # Japanese word segmenter
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')


def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)