def test_sentence_tokenizer_marathi(self):
    """Run the 'marathi' tokenizer on a sample passage and compare with the expected tokens."""
    expected = [
        'अर्जुन', 'उवाच', '।',
        'एवं', 'सतत', 'युक्ता', 'ये', 'भक्तास्त्वां', 'पर्युपासते', '।',
        'ये', 'चाप्यक्षरमव्यक्तं', 'तेषां', 'के', 'योगवित्तमाः', '॥',
    ]
    passage = "अर्जुन उवाच । एवं सतत युक्ता ये भक्तास्त्वां पर्युपासते । ये चाप्यक्षरमव्यक्तं तेषां के योगवित्तमाः ॥"
    produced = TokenizeSentence('marathi').tokenize(passage)
    self.assertEqual(produced, expected)
def test_sentence_tokenizer_sanskrit(self):
    """Run the 'sanskrit' tokenizer on a sample passage and compare with the expected tokens."""
    expected = [
        'श्री', 'भगवानुवाच', 'पश्य', 'मे', 'पार्थ', 'रूपाणि', 'शतशोऽथ', 'सहस्रशः', '।',
        'नानाविधानि', 'दिव्यानि', 'नानावर्णाकृतीनि', 'च', '।', '।',
    ]
    passage = "श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः। नानाविधानि दिव्यानि नानावर्णाकृतीनि च।।"
    produced = TokenizeSentence('sanskrit').tokenize(passage)
    self.assertEqual(produced, expected)
def test_sentence_tokenizer_classical_hindi(self):
    """Run the 'hindi' tokenizer on a sample sentence and compare with the expected tokens."""
    expected = [
        'जलर्', 'चिकित्सा', 'से', 'उन्हें', 'कोई', 'लाभ', 'नहीं', 'हुआ', '।',
    ]
    passage = "जलर् चिकित्सा से उन्हें कोई लाभ नहीं हुआ।"
    produced = TokenizeSentence('hindi').tokenize(passage)
    self.assertEqual(produced, expected)
def cleaning_data(str):
    """Tokenize, clean and re-join a piece of text.

    The input is tokenized with ``TokenizeSentence('bengali')``, the tokens
    are handed to ``clean``, and whatever ``clean`` returns is joined with
    single spaces.

    Note: the parameter name shadows the builtin ``str``; it is kept because
    it is part of the function's signature.
    """
    tokens = TokenizeSentence('bengali').tokenize(str)
    return ' '.join(clean(tokens))
def test_sentence_tokenizer_telugu(self):
    """Run the 'telugu' tokenizer on a sample passage and compare with the expected tokens."""
    expected = [
        'తా', '.',
        'ఎక్కడెక్కడ', 'బుట్టిన', 'నదులును', 'రత్నాకరుడను', 'నాశతో', 'సముద్రుని',
        'చేరువిధముగా', 'నెన్నియిక్కట్టులకైన', 'నోర్చి', 'ప్రజలు', 'దమంతట', 'దామె',
        'ప్రియముం', 'జూపుచు', 'ధనికుని', 'యింటికేతెంచుచుందురు', '.',
    ]
    passage = "తా. ఎక్కడెక్కడ బుట్టిన నదులును రత్నాకరుడను నాశతో సముద్రుని చేరువిధముగా నెన్నియిక్కట్టులకైన నోర్చి ప్రజలు దమంతట దామె ప్రియముం జూపుచు ధనికుని యింటికేతెంచుచుందురు."
    produced = TokenizeSentence('telugu').tokenize(passage)
    self.assertEqual(produced, expected)
def test_sentence_tokenizer_bengali(self):
    """Run the 'bengali' tokenizer on a sample sentence and compare with the expected tokens."""
    expected = [
        'দুর্ব্বাসার', 'শাপে', 'রাজা', 'শকুন্তলাকে', 'একেবারে',
        'ভুলে', 'বেশ', 'সুখে', 'আছেন', '।',
    ]
    passage = "দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।"
    produced = TokenizeSentence('bengali').tokenize(passage)
    self.assertEqual(produced, expected)
def porter_tokenizer(text):
    """
    Tokenize `text` with ``TokenizeSentence('bengali')``.

    Despite its name, this function performs no Porter stemming and does not
    drop punctuation-only or single-letter tokens: the tokenizer output is
    returned unchanged.

    Parameters
    ----------
    text : `str`.
      The text to tokenize.

    Returns
    ----------
    tokens
      Whatever ``TokenizeSentence('bengali').tokenize(text)`` returns.

    """
    tokenizer = TokenizeSentence('bengali')
    return tokenizer.tokenize(text)
def test_sentence_tokenizer_sanskrit(self):
    """Split a Sanskrit passage with the 'sanskrit' tokenizer and compare with the expected sentences."""
    expected = [
        'श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः।',
        'यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।',
        'न मे विदुः सुरगणाः प्रभवं न महर्षयः।',
        'अहमादिर्हि देवानां महर्षीणां च सर्वशः।।',
    ]
    passage = """श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः। यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।। न मे विदुः सुरगणाः प्रभवं न महर्षयः। अहमादिर्हि देवानां महर्षीणां च सर्वशः।।"""
    produced = TokenizeSentence('sanskrit').tokenize(passage)
    self.assertEqual(produced, expected)
def preprocess_doc(sent, params=None):
    """Clean a document and return its sentences in processed form.

    The text is demojized, stripped of URLs (and optionally of numbers),
    split into sentences with ``TokenizeSentence('hindi')``, and every
    sentence is then cleaned of emoji codes and punctuation, lower-cased and
    optionally tokenized and stop-word filtered.

    Parameters
    ----------
    sent : str
        A document or sentence.
    params : dict, optional
        Control flags. Any key that is missing falls back to its default:
        ``remove_numbers`` (False), ``remove_emoji`` (True),
        ``remove_stop_words`` (True), ``tokenize`` (True).

    Returns
    -------
    list
        One entry per sentence that is non-empty after processing: the
        output of ``tok.tokenize`` (stop-word filtered when requested) if
        ``tokenize`` is set, otherwise the cleaned sentence string.
    """
    # Merge caller flags over the defaults so a partial dict does not raise
    # KeyError, and so no mutable default argument is shared between calls.
    opts = {
        'remove_numbers': False,
        'remove_emoji': True,
        'remove_stop_words': True,
        'tokenize': True,
    }
    opts.update(params or {})

    sent = emoji.demojize(sent)
    sent = re.sub(r"http\S+", '', sent)
    # The dot is escaped so that only a literal "www." prefix is treated as a URL.
    sent = re.sub(r"www\.\S+", '', sent)
    if opts['remove_numbers']:
        sent = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", sent)
    # NOTE(review): the next two substitutions are applied unconditionally;
    # confirm they were not meant to run only when remove_numbers is set.
    sent = re.sub(r"/-", " ", sent)
    sent = re.sub(r"#,\,", " ", sent)

    # Loop-invariant helpers, built once instead of once per sentence.
    collapsible = set(punctuation) - set('.')
    strip_puncts = re.compile('[' + re.escape(''.join(puncts)) + ']')

    all_sents = []
    for s in TokenizeSentence('hindi').tokenize(sent):
        if opts['remove_emoji']:
            # Drop ":name:" codes produced by emoji.demojize.
            s = re.sub(r":\S+:", "", s)
        else:
            s = re.sub(r"[:\*]", "", s)

        # Collapse runs of the same punctuation mark (except '.') to one
        # character; every other run is copied through unchanged.
        pieces = []
        for char, run in groupby(s):
            if char in collapsible:
                pieces.append(char)
            else:
                pieces.extend(run)
        s = strip_puncts.sub('', ''.join(pieces)).lower()

        if opts['tokenize']:
            msg = tok.tokenize(s)
            if opts['remove_stop_words']:
                msg = [word for word in msg if word not in stop_for_this]
        else:
            msg = s

        if len(msg) > 0:
            all_sents.append(msg)
    return all_sents
def test_classical_hindi_stops(self):
    """
    Check that tokens found in HINDI_STOPS are filtered out of a sentence.
    Sentence extracted from (https://github.com/cltk/hindi_text_ltrc/blob/master/miscellaneous/gandhi/main.txt)
    """
    expected = ['काबुली', 'फिर', 'वहां', 'आकर', 'खडा', 'गया']
    raw = " वह काबुली फिर वहां आकर खडा हो गया है "
    kept = []
    for token in TokenizeSentence('hindi').tokenize(raw):
        if token not in HINDI_STOPS:
            kept.append(token)
    self.assertEqual(kept, expected)
def createCorpus(text, save=True):
    '''
    Build a stopword-free tokenized corpus and a per-word weight table.

    :params text - the raw text
    :params save - when True, write the weight table (JSON) and the corpus
                   (.npy) to their fixed paths under ../../data

    returns
        + the corpus, a list of list with tokenized sentences
        + the vocab (a dictionary mapping each kept token to a weight
          computed from its frequency scaled by the total number of kept
          words).
    '''
    with open('../../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        # A set gives constant-time membership tests in the filter below.
        stopwords = set(src.read().split('\n'))
    stopwords.update([".", ",", "?", "!", "-", ":", ";", "·"])

    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')
    sentences = Stokenizer.tokenize(text)

    new_sentences = []
    vocab = dict()
    total_tokens = 0
    print('Building corpus and freqDictionary')
    for sent in tqdm(sentences, desc="Sentences"):
        # Stopword deletion
        new_sent = [w for w in Wtokenizer.tokenize(sent) if w not in stopwords]
        new_sentences.append(new_sent)
        total_tokens += len(new_sent)
        for w in new_sent:
            vocab[w] = vocab.get(w, 0) + 1

    # word2vec-style weight: (sqrt(z / t) + 1) * t / z, where z is the word's
    # share of all kept tokens and t the sampling threshold. The share must
    # be taken over the token total (not the number of distinct words), and
    # the square root takes z / t (not z * t).
    threshold = 0.001
    for k, v in vocab.items():
        frac = v / total_tokens
        vocab[k] = (1 + np.sqrt(frac / threshold)) * threshold / frac

    if save:
        print('Saving the frequencies')
        with open('../../data/vocabularies/Homer_word_frequencies.json', 'w', encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)
        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../../data/Homer_tokenized_corpus.npy', arr)
    return new_sentences, vocab
def bangla_tokenize(text):
    """Tokenize every entry of `text` with ``TokenizeSentence('bengali')``.

    Parameters
    ----------
    text : iterable of str
        The texts to tokenize, one entry per item. Passing a single string
        iterates over its characters, so wrap a lone string in a list.

    Returns
    -------
    list
        One tokenizer result per input entry, in input order.
    """
    lines = list(text)
    if not lines:
        # Nothing to tokenize, so no tokenizer needs to be constructed.
        return []
    # Build the tokenizer once rather than once per line, and append in
    # order instead of inserting at the front and reversing afterwards.
    tokenizer = TokenizeSentence('bengali')
    return [tokenizer.tokenize(line) for line in lines]
def createCorpus(text, save=True):
    '''
    Build a stopword-free tokenized corpus and a per-word weight table.

    :params text - the raw text
    :params save - when True, write the weight table (JSON, to
                   args.word_frequencies), the corpus (.npy) and the word
                   list (CSV) to disk

    returns
        + the corpus, a list of list with tokenized sentences
        + the vocab (a dictionary mapping each kept token to a weight
          computed from its frequency scaled by the total number of kept
          words).
    '''
    # load stopwords; a set gives constant-time membership tests below
    with open('../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        stopwords = set(src.read().split('\n'))
    # add punctuation signs
    stopwords.update(
        [".", ",", "?", "!", "-", ":", ";", "·", "”", "“", "«", "»"])

    # tokenize sentences and then words
    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')
    sentences = Stokenizer.tokenize(text)

    new_sentences = []
    vocab = dict()
    print('Building corpus and freqDictionary')
    total_tokens = 0  # tokens kept after stopword removal
    check = 0  # tokens seen before stopword removal
    for sent in tqdm(sentences, desc="Sentences"):
        raw_tokens = Wtokenizer.tokenize(sent)
        check += len(raw_tokens)
        # Stopword deletion
        new_sent = [w for w in raw_tokens if w not in stopwords]
        new_sentences.append(new_sent)
        total_tokens += len(new_sent)
        # add each word to dictionary or update count
        for w in new_sent:
            vocab[w] = vocab.get(w, 0) + 1

    vocab_size = len(vocab)
    print("total tokens: ", total_tokens)
    print("total token (incl. stopwords)", check)
    print("vocab_size : ", vocab_size)

    # Formula from
    # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/
    # Not really used for subsampling here but to generate the noise distribution
    threshold = 10e-05
    for k, v in vocab.items():
        frac = v / total_tokens
        vocab[k] = (1 + math.sqrt(frac / threshold)) * (threshold / frac)

    if save:
        print('Saving the frequencies')
        with open(args.word_frequencies, 'w', encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)
        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../data/Homer_tokenized_accented.npy', arr)
        with open('../data/vocabs/Homer_wordList.csv', "w", encoding="utf-8") as fp:
            for idx, word in tqdm(enumerate(vocab)):
                fp.write(str(idx) + "," + word + "\n")
    return new_sentences, vocab
def tokenizer(str):
    """Return the result of tokenizing the input with ``TokenizeSentence('bengali')``.

    Note: the parameter name shadows the builtin ``str``; it is kept because
    it is part of the function's signature.
    """
    return TokenizeSentence('bengali').tokenize(str)
def text_file_read(book_name, choice):
    """Normalize a text file in place and export its sentences to Excel.

    The file is read, whitespace is normalized (runs of spaces collapsed,
    blank lines and tabs removed), and the cleaned text is written back over
    the original file. The text is then split into sentences, which are
    written to ``<book_name>.xlsx``.

    Parameters
    ----------
    book_name
        Path of the text file; converted with ``str()`` before use.
    choice
        ``1`` treats the file as English: sentences come from
        ``sent_tokenize`` and go to an ``English`` column. Any other value
        treats it as Hindi: ``?`` and ``!`` are first replaced by the
        Devanagari danda, sentences come from ``TokenizeSentence('hindi')``,
        go to a ``Hindi`` column, and are also appended to the global ``L1``.
    """
    global L1
    path = str(book_name)
    is_hindi = choice != 1

    # Read inside a context manager so the handle is closed before the same
    # path is reopened for writing below.
    with open(path, encoding="utf8") as source:
        s = " ".join(source)

    if is_hindi:
        # Use the Devanagari danda (U+0964) as the sentence terminator; the
        # previous literal was a mis-decoded form of this character.
        s = re.sub("[?!]", "।", s)

    text = re.sub("[ ]+", " ", s)
    text = re.sub("[\n]+", "\n", text)
    text = re.sub("[\t]+", "", text)
    # Drop lines that are empty or whitespace-only.
    text = "".join([line for line in text.strip().splitlines(True) if line.strip()])
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

    Data_frame_Translation = pd.DataFrame()
    if is_hindi:
        l1 = TokenizeSentence('hindi').tokenize(text)
        Data_frame_Translation['Hindi'] = l1
        L1.extend(l1)
    else:
        l1 = sent_tokenize(text)
        Data_frame_Translation['English'] = l1
    Data_frame_Translation.to_excel(path + '.xlsx', index=False)
from translate import translator
#nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize
from openpyxl.workbook import Workbook

# Collect sentence lists from each paragraph row: `h` receives the Hindi
# sentences and `e` the English ones. Within a row the shorter list is
# padded with the string "None" so both grow by the same amount.
e = []
h = []
df = pd.read_excel("E:/hindi_english_downloaded_split/english_corpora/maths/6th/train_paragraph_level.xlsx")
print(df.columns)
tokenizer = TokenizeSentence('hindi')
for index, row in df.iterrows():
    s = str(row["Hindi"])
    s1 = str(row["English"])
    # Use the Devanagari danda (U+0964) as the sentence terminator; the
    # previous literal was a mis-decoded form of this character.
    s = re.sub("[!?.]", "।", s)
    l1 = tokenizer.tokenize(s)
    l2 = sent_tokenize(s1)
    shortfall = len(l1) - len(l2)
    if shortfall > 0:
        l2.extend(["None"] * shortfall)
    elif shortfall < 0:
        l1.extend(["None"] * -shortfall)
    e.extend(l2)
    h.extend(l1)
df = pd.read_excel("E:/hindi_english_downloaded_split/english_corpora/maths/6th/missmatched_section.xlsx")
print(df.columns)
infile = open(infilename, "r") string = infile.readline() while string: string = string[:-1] filestring.append(string) string = infile.readline() infile.close() outfile = open(outfilename, "w") for shlok in filestring: #picking one shloka from the file t_shlok = tokenizer.tokenize(shlok) #initializing the flags count = 0 # to count the number of phonemes after which the split has to be done pos = 0 # to insert the - diff = 0 # to keep track of the overflow phonemes for i in range(len(t_shlok)): token = t_shlok[i] split = syl.orthographic_syllabify(token) l = len(split) # phonemes already covered prev = count #checking for purna-viram and numbers if l == 1 and check_token(token) == False:
def candidate_words(self, stripped_input):
    """Return `stripped_input` tokenized by CLTK's ``TokenizeSentence('bengali')``."""
    # Imported locally, as in the original, so cltk is only loaded on use.
    from cltk.tokenize.sentence import TokenizeSentence

    return TokenizeSentence('bengali').tokenize(stripped_input)
t = Tokenizer() t.read_from_file(PATH + file + '/' + inner_file + '/' + inner_inner_file) split_shit = t.generate_sentences() final_split_shit = [] for i in split_shit: hello = re.split('\?|\!', i) for k in hello: final_split_shit.append(k) filtered_final_split_shit = [] for i in final_split_shit: if (not (bool(re.match('^\s+$', i)))): filtered_final_split_shit.append(i) words = [] for i in filtered_final_split_shit: sentence_tokenized = tokenizer.tokenize(i) for k in sentence_tokenized: words.append(k.strip('\n')) length = [ len(tokenizer.tokenize(i)) for i in filtered_final_split_shit ] one = statistics.mean(length) two = statistics.stdev(length) vocabulary = set(words) three = len(vocabulary) / len(words) feature = [] feature = [one, two, three] features.append(feature) file = open("../pickle/features.pkl", 'wb')