Example #1
import os
import re
import sys
from collections import Counter

from nltk.stem.isri import ISRIStemmer


def main():

    # Open all files related to removing stop words or punctuation from the data.
    with open(r"../data/arstoplist.txt") as sw_in:
        stopwords = sw_in.read().splitlines()
    with open("../data/arabpunct.txt") as punct_in:
        punctlist = punct_in.read().splitlines()

    directory = sys.argv[1]

    # Give location of input files.
    files = os.listdir("../out/" + directory + "/ar/")

    st = ISRIStemmer()
    #rx_en = re.compile(r'\D+')
    tokens = []
    counter = 0
    filelist = []

    for f in files:
        if "txt" in f:
            counter += 1
            f_in = open("../out/" + directory + "/ar/" + f)  # 'rU' mode is gone in Python 3; default text mode suffices
            lines = f_in.readlines()
            f_in.close()
            filelist.extend(lines)

    print("Files read.")

    stemmed = {}
    types = {}

    f_out = open('../out/testset-tokenized-' + directory + '.txt', 'w')
    compl_list = []

    # Define all patterns that shall be excluded (compiled once, outside the loop).
    rx_ar = re.compile(u'^[\u0621-\u064A]+$')  # excludes Arabic words that have numbers attached to them
    rx_ar2 = re.compile(u'^(\u0622{2,})')

    for line in filelist:
        #line = line.strip()
        #tokenize = word_tokenize(line)

        # Tokenize the text (tokenizer() is assumed to be defined/imported elsewhere in the project).
        tokenize = tokenizer(line)

        #tokenize.sort() # Comment this out after the test-set has been used?

        for w in tokenize:
            if len(w) == 1:
                pass
            elif rx_ar2.match(w):
                pass
            elif rx_ar.match(w):
                f_out.write(w + "\n")
                compl_list.append(w)
            else:
                pass
    f_out.close()

    # add back in

    for w in compl_list:
        types[w] = 0
        #if punctlist[0] in compl_list or punctlist[1] in compl_list or punctlist[2] or punctlist[3] in compl_list:
        #    if len(w) > 1: # ERROR
        #        new_w = w[:-1] # ERROR! This strips off Arabic letters although they are not in the punctlist
        #        types[new_w] = 0
        #        tokens.append(new_w)
        #    else:
        #        types[w] = 0
        #        tokens.append(w)

    print(str(len(types)) + " different words.")
    print("Punctuation separated.")

    # Here the actual stemming happens.
    verbs = {}
    c = -1
    for w in types:
        c += 1
        if w not in stopwords:
            stm = st.stem(w)
            stemmed[w] = stm
            verbs[stm] = 0
        if c % 10000 == 0:
            print(str(c) + " words stemmed.")
    print("File stemmed.")

    # print the stemmed words and their unstemmed versions to a file
    f_out = open('../out/stem_tok_' + directory + '.txt', 'w')
    wordlist = []
    for w in verbs.keys():
        # Skip words that are longer than 4 letters. Verbs in Arabic are usually 3
        # letters long; in very rare cases they can be 2 or 4 letters long as well.
        if len(w) > 4:
            pass
        else:
            wordlist.append(w)
            #f_out.write(w + "\t" + stemmed[w])
            #f_out.write(w + "\n")
    wordlist.sort()
    for w in wordlist:
        f_out.write(w + "\n")
    f_out.write("No. of verbs: " + str(len(wordlist)))  # Really verbs? Why not wordlist?
    f_out.close()

    # Handle some corpus stats. `tokens` is never filled in above (only in the
    # commented-out block), so count the collected words instead.
    corp_stat = Counter(compl_list)
    for w, n in corp_stat.most_common(10):
        print("token: " + w + "\tno.: " + str(n))
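
# Assumed entry point: the script expects the name of a sub-directory of ../out/
# as its first command-line argument (it becomes sys.argv[1] above).
if __name__ == "__main__":
    main()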
Example #2
import pickle
# load the dataset
data = open('ManualAnnotatedFakeNewsDataset.txt').read()
#data = open('AutomaticAnnotatedFakeNewsDataset.txt').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    labels.append(content[0])
    texts.append(" ".join(content[1:]))
#stemming
data1 = []
from nltk import word_tokenize

from nltk.stem.isri import ISRIStemmer

st = ISRIStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())

#print(data1[:10])
#tashfeen
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        # light-stem each token with Tashaphyne (loop body is a minimal sketch,
        # assumed to mirror the ISRI loop above)
        ArListem.light_stem(a)
        tweet = tweet + ArListem.get_stem() + " "
    data2.append(tweet.strip())
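
# Optional sketch (our assumption, not from the snippet above): Tashaphyne can also
# return the extracted root rather than the light stem via get_root().
roots = []
for tx in texts:
    word_roots = []
    for a in word_tokenize(tx):
        ArListem.light_stem(a)
        word_roots.append(ArListem.get_root())
    roots.append(" ".join(word_roots))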
Example #3
 def stemming_text(self, text):
     st = ISRIStemmer()
     return [st.stem(w) for w in text]


import re

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer


def clean_data():
    # `docs` is assumed to be provided elsewhere as a list of dicts with a 'content' key.
    tokens = []
    for doc in docs:
        # print (doc)
        for line in doc['content']:
            # remove non-alphabetical and non-Arabic characters
            text = re.sub(r'[\d+ a-zA-Z? & , \xd8 « » . :"،]', ' ', line)
            tkns = text.split()
            tokens.append(tkns)  # produces a list of lists of tokens
    cleaned_data = [item for item in tokens if item != []]
    return cleaned_data


stemmer = ISRIStemmer()
data = clean_data()  # this is a list of lists of tokens


def lemmatizer(token):
    #print ("Data lemmatized")
    token = stemmer.pre32(token)  # removes the three-letter and two-letter prefixes
    token = stemmer.suf32(token)  # removes the three-letter and two-letter suffixes
    token = stemmer.norm(token, num=1)  # removes diacritics
    return token
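
# Minimal usage sketch (assumption, not part of the original fragment): run
# lemmatizer() over every token in the cleaned corpus returned by clean_data().
lemmatized = [[lemmatizer(tok) for tok in doc_tokens] for doc_tokens in data]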


def stop_words():
    stop_words = stopwords.words('arabic')
    return stop_words
 def __init__(self):
     self.stemmer = ISRIStemmer()
     self.stopWordsIndex = ArabicStopWordsIndex(self)
     self.stopWordsIndex.buildIndex()
Example #6
import string

#
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()

text = "على قيادة المؤتمر الشعبي العام قراءة رسالة الشعب جيدا من خلال احتشاد ميدان السبعين ، والتي تعني تحمل مسؤليته"

words = text.split()

new_words = []

for word in words:
    #stem word
    new_word = isri.stem(word)
    #print("."+new_word+".")

    #don't append if stemming turns it into whitespace/""
    if new_word != "":
        new_words.append(new_word)

#return this
new_text = ' '.join(new_words)

print(new_text)
 def __init__(self):
     self.st = ISRIStemmer()
Example #8
def _getstem(_word):
    st = ISRIStemmer()
    return st.stem(_word)


import json
import traceback

import nltk
import tinysegmenter
from nltk import word_tokenize
from nltk.stem import RSLPStemmer, WordNetLemmatizer
from nltk.stem.isri import ISRIStemmer
from nltk.stem.snowball import SnowballStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
#from analyzer.kg_export.language.kazlemmatizer import kazakh_lemma_tokenizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

use_compound_split_german = False
if use_compound_split_german:
    import LanguageDetection

stem_ar = ISRIStemmer()  # Arabic stemmer
factory = StemmerFactory()
sastrawi_stemmer = factory.create_stemmer()  # Indonesian stemmer (Sastrawi)
stem_pt = RSLPStemmer()  # Brazilian Portuguese stemmer
stem_ja = tinysegmenter.TinySegmenter()  # Japanese word segmenter
stem_nl = SnowballStemmer('dutch')
stem_ru = SnowballStemmer('russian')
stem_sv = SnowballStemmer('swedish')
stem_fr = SnowballStemmer('french')
stem_de = SnowballStemmer('german')


def read_file(filename):
    try:
        with open(filename, "r") as file_dp:
            data = json.load(file_dp)