Example No. 1
from sinling.sinhala.tokenizer import SinhalaTweetTokenizer

# codecs: read the input files with explicit UTF-8 encoding
import codecs

# pickle: serialize the tokenized sentences to disk
import pickle

f_1 = codecs.open("Preprocessed/test_1.txt", encoding='utf-8', errors='ignore')
f_2 = codecs.open("Preprocessed/test_2.txt", encoding='utf-8', errors='ignore')
f_3 = codecs.open("Preprocessed/test_3.txt", encoding='utf-8', errors='ignore')
f_4 = codecs.open("Preprocessed/test_4.txt", encoding='utf-8', errors='ignore')
f_5 = codecs.open("Preprocessed/test_5.txt", encoding='utf-8', errors='ignore')
f_6 = codecs.open("Preprocessed/test_6.txt", encoding='utf-8', errors='ignore')

tokenizer = SinhalaTweetTokenizer()

sentences = []
for line in f_1:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))
print("Finished File 1")

for line in f_2:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))
print("Finished File 2")

for line in f_3:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))
print("Finished File 3")
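Example No. 1 imports pickle with a "save to disk" comment, but the visible snippet stops before any save step. A minimal sketch of what that step might look like (the output file name sentences.pkl is an assumption, not taken from the original code):

# Assumed final step: serialize the collected token lists for later reuse.
with open("sentences.pkl", "wb") as out_f:
    pickle.dump(sentences, out_f)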
Example No. 2

from sinling.sinhala.tokenizer import SinhalaTweetTokenizer
import codecs

f_1 = codecs.open("Preprocessed/test_1.txt", encoding='utf-8', errors='ignore')
f_2 = codecs.open("Preprocessed/test_2.txt", encoding='utf-8', errors='ignore')
f_3 = codecs.open("Preprocessed/test_3.txt", encoding='utf-8', errors='ignore')
f_4 = codecs.open("Preprocessed/test_4.txt", encoding='utf-8', errors='ignore')
f_5 = codecs.open("Preprocessed/test_5.txt", encoding='utf-8', errors='ignore')
f_6 = codecs.open("Preprocessed/test_7.txt", encoding='utf-8', errors='ignore')

tokenizer = SinhalaTweetTokenizer()

sentences = []
# for line in f_1:
#     line = line.rstrip()
#     sentences.append(tokenizer.tokenize(line))
#
# print("Finished File 1")
#
# for line in f_2:
#     line = line.rstrip()
#     sentences.append(tokenizer.tokenize(line))
#
# print("Finished File 2")
#
# for line in f_3:
#     line = line.rstrip()
#     sentences.append(tokenizer.tokenize(line))
#
# print("Finished File 3")
#
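In Example No. 2 the per-file loops are commented out. A compact equivalent of that commented-out processing, mirroring the logic of Example No. 1 (this is an assumption about the intended behavior, not part of the original snippet):

# Assumed reconstruction of the commented-out loops: tokenize every opened
# file line by line and collect the token lists in `sentences`.
for i, f in enumerate([f_1, f_2, f_3, f_4, f_5, f_6], start=1):
    for line in f:
        sentences.append(tokenizer.tokenize(line.rstrip()))
    print("Finished File {}".format(i))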
Example No. 3
from sinling.sinhala.tokenizer import SinhalaTweetTokenizer

import codecs

# First step: remove special characters and garbage characters from the corpus.
# Uses the Sinhala tokenizer available at https://github.com/ysenarath/sinling, with several modifications (see the sinling folder).

if __name__ == '__main__':
    f = codecs.open("D:/NLP/Corpus/wikipedia.si_filtered",
                    encoding='utf-8',
                    errors='ignore')  # open source file
    f_w = codecs.open("wikipedia.si_filtered_tokenized.txt", 'w',
                      'utf-8')  # write to this file after processing

    tokenizer = SinhalaTweetTokenizer()
    docs = []
    for line in f:  # read the whole corpus into memory
        docs.append(line)

    for doc in docs:
        # Split each document into sentences, tokenize each sentence,
        # and write it out as one space-separated line.
        for sent in tokenizer.split_sentences(doc):
            tokens = tokenizer.tokenize(sent)
            line = " ".join(tokens) + "\n"
            f_w.write(line)

    f.close()
    f_w.close()
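The explicit close() calls can be avoided entirely. A minimal rewrite of Example No. 3 using context managers, with the same paths and tokenizer calls (the streaming of the corpus line by line, instead of buffering it in a list, is the only other change and is an editorial sketch, not the original author's code):

from sinling.sinhala.tokenizer import SinhalaTweetTokenizer
import codecs

if __name__ == '__main__':
    tokenizer = SinhalaTweetTokenizer()
    # Files are closed automatically when the with-block exits.
    with codecs.open("D:/NLP/Corpus/wikipedia.si_filtered",
                     encoding='utf-8', errors='ignore') as f, \
         codecs.open("wikipedia.si_filtered_tokenized.txt", 'w', 'utf-8') as f_w:
        for doc in f:  # stream the corpus line by line
            for sent in tokenizer.split_sentences(doc):
                f_w.write(" ".join(tokenizer.tokenize(sent)) + "\n")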