def test_sentence_tokenizer_latin(self): """Test tokenizing Latin sentences.""" text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem." # pylint: disable=line-too-long target = ['O di inmortales!', 'ubinam gentium sumus?', 'in qua urbe vivimus?', 'quam rem publicam habemus?', 'Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent!', 'Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero!', 'Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem.'] # pylint: disable=line-too-long tokenizer = TokenizeSentence('latin') tokenized_sentences = tokenizer.tokenize_sentences(text) self.assertEqual(tokenized_sentences, target)
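For readers new to the API under test, a minimal standalone sketch (assuming the Latin models have already been fetched, e.g. with CorpusImporter('latin').import_corpus('latin_models_cltk')):

# Hedged sketch: direct use of the Latin sentence tokenizer exercised by the test above.
# Assumes the 'latin_models_cltk' data is already installed under ~/cltk_data.
from cltk.tokenize.sentence import TokenizeSentence

latin_tokenizer = TokenizeSentence('latin')
sample = "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
for sent in latin_tokenizer.tokenize_sentences(sample):
    print(sent)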
def test_sentence_tokenizer_marathi(self): """Test tokenizing marathi sentences.""" text = "अर्जुन उवाच । एवं सतत युक्ता ये भक्तास्त्वां पर्युपासते । ये चाप्यक्षरमव्यक्तं तेषां के योगवित्तमाः ॥" target = ['अर्जुन', 'उवाच', '।', 'एवं', 'सतत', 'युक्ता', 'ये', 'भक्तास्त्वां', 'पर्युपासते', '।', 'ये', 'चाप्यक्षरमव्यक्तं', 'तेषां', 'के', 'योगवित्तमाः', '॥'] tokenizer = TokenizeSentence('marathi') tokenized_sentences = tokenizer.tokenize(text) self.assertEqual(tokenized_sentences, target)
def _tokenize(self, text):
    """Use NLTK's standard tokenizer, rm punctuation.

    :param text: pre-processed text
    :return: tokenized text
    :rtype : list
    """
    sentence_tokenizer = TokenizeSentence('latin')
    sentences = sentence_tokenizer.tokenize_sentences(text.lower())
    sent_words = []
    punkt = PunktLanguageVars()
    for sentence in sentences:
        words = punkt.word_tokenize(sentence)
        assert isinstance(words, list)
        words_new = []
        for word in words:
            # keep only tokens that are not punctuation, abbreviations, or numbers
            if word not in self.punctuation and word not in self.abbreviations and word not in self.numbers:  # pylint: disable=line-too-long
                words_new.append(word)
        # rm all numbers here with: re.compile(r'[0-9]')
        sent_words.append(words_new)
    return sent_words
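The helper above leans on NLTK's PunktLanguageVars for word splitting; a small sketch of that call in isolation (plain NLTK, no CLTK models needed):

# Hedged sketch of the NLTK helper used above. PunktLanguageVars.word_tokenize
# splits on whitespace and most punctuation without any trained model.
from nltk.tokenize.punkt import PunktLanguageVars

punkt = PunktLanguageVars()
tokens = punkt.word_tokenize("arma virumque cano, troiae qui primus ab oris.")
# commas come out as separate tokens; the sentence-final period typically
# stays attached to the last word, which is why the caller filters afterwards
print(tokens)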
def test_sentence_tokenizer_latin(self): """Test tokenizing Latin sentences.""" sentences = "Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti. Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum." # pylint: disable=line-too-long good_tokenized_sentences = ['Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti.', 'Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum.'] # pylint: disable=line-too-long tokenizer = TokenizeSentence('latin') tokenized_sentences = tokenizer.tokenize_sentences(sentences) self.assertEqual(tokenized_sentences, good_tokenized_sentences)
def cleaning_data(text):
    """Tokenize Bengali text, clean the tokens, and return them as a single string."""
    tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = tokenizer.tokenize(text)
    # print(bengali_text_tokenize)
    cleaned = clean(bengali_text_tokenize)
    cleaned = ' '.join(cleaned)
    return cleaned
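cleaning_data relies on a clean() helper defined elsewhere in its project; the stand-in below is purely hypothetical, shown only so the flow can be tried end to end.

# Hypothetical stand-in for the project's clean() helper used by cleaning_data above;
# the real implementation is not shown in this snippet.
import string

def clean(tokens):
    # drop punctuation-only tokens (including the dandas) and surrounding whitespace
    punctuation = set(string.punctuation) | {'।', '॥'}
    return [t.strip() for t in tokens if t.strip() and t.strip() not in punctuation]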
def test_sentence_tokenizer_sanskrit(self): """Test tokenizing sanskrit sentences.""" text = "श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः। नानाविधानि दिव्यानि नानावर्णाकृतीनि च।।" target = ['श्री', 'भगवानुवाच', 'पश्य', 'मे', 'पार्थ', 'रूपाणि', 'शतशोऽथ', 'सहस्रशः', '।', 'नानाविधानि', 'दिव्यानि', 'नानावर्णाकृतीनि', 'च', '।', '।'] tokenizer = TokenizeSentence('sanskrit') tokenized_sentences = tokenizer.tokenize(text) self.assertEqual(tokenized_sentences, target)
def test_sentence_tokenizer_telugu(self): """Test tokenizing telugu sentences.""" text = "తా. ఎక్కడెక్కడ బుట్టిన నదులును రత్నాకరుడను నాశతో సముద్రుని చేరువిధముగా నెన్నియిక్కట్టులకైన నోర్చి ప్రజలు దమంతట దామె ప్రియముం జూపుచు ధనికుని యింటికేతెంచుచుందురు." target = ['తా', '.', 'ఎక్కడెక్కడ', 'బుట్టిన', 'నదులును', 'రత్నాకరుడను', 'నాశతో', 'సముద్రుని', 'చేరువిధముగా', 'నెన్నియిక్కట్టులకైన', 'నోర్చి', 'ప్రజలు', 'దమంతట', 'దామె', 'ప్రియముం', 'జూపుచు', 'ధనికుని', 'యింటికేతెంచుచుందురు', '.'] tokenizer = TokenizeSentence('telugu') tokenized_sentences = tokenizer.tokenize(text) self.assertEqual(tokenized_sentences, target)
def test_sentence_tokenizer_classical_hindi(self): """Test tokenizing classical_hindi sentences.""" text = "जलर् चिकित्सा से उन्हें कोई लाभ नहीं हुआ।" target = ['जलर्', 'चिकित्सा', 'से', 'उन्हें', 'कोई', 'लाभ', 'नहीं', 'हुआ', '।'] tokenizer = TokenizeSentence('hindi') tokenized_sentences = tokenizer.tokenize(text) self.assertEqual(tokenized_sentences, target)
def tokenizing():
    matched_englist_list = []
    matched_hindi_list = []
    non_matched_englist_list = []
    non_matched_hindi_list = []
    global dat_frame_Matched
    global dat_frame_Not_Matched
    global final_df_matched
    global final_df_non_matched
    global count_sentence_positive
    global count_sentence_negative
    global total_section
    global count_section_average
    global total_section_mis_match_english
    global count_section_average_mis_match_english
    global total_section_mis_match_hindi
    global count_section_average_mis_match_hindi
    tokenizer = TokenizeSentence('hindi')
    # Sections whose Hindi and English sentence counts agree are kept as aligned pairs;
    # the rest are collected separately for later inspection.
    for index, row in dat_frame_Matched.iterrows():
        #row["Hindi"] = re.sub("\d+\.", "", row["Hindi"])
        #row["English"] = re.sub("\d+\.", "", row["English"])
        l1 = tokenizer.tokenize(row["Hindi"])
        l2 = sent_tokenize(row["English"])
        if len(l1) == len(l2):
            total_section = total_section + len(l2)
            count_section_average = count_section_average + 1
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            total_section_mis_match_english = total_section_mis_match_english + len(l2)
            count_section_average_mis_match_english = count_section_average_mis_match_english + 1
            total_section_mis_match_hindi = total_section_mis_match_hindi + len(l1)
            count_section_average_mis_match_hindi = count_section_average_mis_match_hindi + 1
            count_sentence_negative = count_sentence_negative + len(l1)
            non_matched_englist_list.append(row["English"])
            non_matched_hindi_list.append(row["Hindi"])
    for index, row in dat_frame_Not_Matched.iterrows():
        hind = ' '.join(map(str, row["Hindi"]))
        englsh = ' '.join(map(str, row["English"]))
        l1 = re.split("।", hind)
        l2 = sent_tokenize(englsh)
        if len(l1) == len(l2):
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            count_sentence_negative = count_sentence_negative + len(l2)
            non_matched_englist_list.append(englsh)
            non_matched_hindi_list.append(hind)
    final_df_matched['English'] = matched_englist_list
    final_df_matched['Hindi'] = matched_hindi_list
    final_df_non_matched['Hindi'] = non_matched_hindi_list
    final_df_non_matched['English'] = non_matched_englist_list
def test_sentence_tokenizer_bengali(self): """Test tokenizing bengali sentences.""" text = "দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।" target = ['দুর্ব্বাসার', 'শাপে', 'রাজা', 'শকুন্তলাকে', 'একেবারে', 'ভুলে', 'বেশ', 'সুখে', 'আছেন', '।'] tokenizer = TokenizeSentence('bengali') tokenized_sentences = tokenizer.tokenize(text) self.assertEqual(tokenized_sentences, target)
def porter_tokenizer(text):
    """Tokenize a Bengali sentence with CLTK's TokenizeSentence.

    Despite the name, no Porter stemming or punctuation filtering is applied
    here: the function only splits the input into tokens.

    Parameters
    ----------
    text : `str`
        A sentence that is to be split into words.

    Returns
    -------
    bengali_text_tokenize : `list`
        The list of tokens produced by the tokenizer.
    """
    tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = tokenizer.tokenize(text)
    return bengali_text_tokenize
def test_sentence_tokenizer_sanskrit(self): """Test tokenizing Sanskrit sentences.""" text = """श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः। यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।। न मे विदुः सुरगणाः प्रभवं न महर्षयः। अहमादिर्हि देवानां महर्षीणां च सर्वशः।।""" target = ['श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः।','यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।', 'न मे विदुः सुरगणाः प्रभवं न महर्षयः।', 'अहमादिर्हि देवानां महर्षीणां च सर्वशः।।'] tokenizer = TokenizeSentence('sanskrit') tokenized_sentences = tokenizer.tokenize(text) self.assertEqual(tokenized_sentences, target)
def preprocess_doc( sent, params={ 'remove_numbers': False, 'remove_emoji': True, 'remove_stop_words': True, 'tokenize': True }): '''This function should implememnt a multi-lingual tokenizer ''' '''input: a document / sentence , params is a dict of control sequence''' '''output: should return a token list for the entire document/sentence''' sent = emoji.demojize(sent) sent = re.sub(r"http\S+", '', sent) sent = re.sub(r"www.\S+", '', sent) if (params['remove_numbers'] == True): sent = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", sent) sent = re.sub(r"/-", " ", sent) sent = re.sub(r"#,\,", " ", sent) tokenizer = TokenizeSentence('hindi') sents = tokenizer.tokenize(sent) all_sents = [] for s in sents: if (params['remove_emoji'] == True): s = re.sub(r":\S+:", "", s) else: s = re.sub(r"[:\*]", "", s) punc = set(punctuation) - set('.') newtext = [] for k, g in groupby(s): if k in punc: newtext.append(k) else: newtext.extend(g) s = ''.join(newtext) s = re.sub('[' + re.escape(''.join(puncts)) + ']', '', s) s = s.lower() if (params['tokenize'] == True): msg = tok.tokenize(s) else: msg = s if ((params['tokenize'] == True) and (params['remove_stop_words'] == True)): msg_filtered = [word for word in msg if word not in stop_for_this] else: msg_filtered = msg if (len(msg_filtered) > 0): all_sents.append(msg_filtered) return all_sents
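A usage sketch for preprocess_doc; it depends on module-level names not shown here (tok, puncts, stop_for_this), so the call below is illustrative rather than runnable on its own.

# Illustrative call only: preprocess_doc assumes tok, puncts and stop_for_this
# are defined at module level, as in the original project.
doc = "जलर् चिकित्सा से उन्हें कोई लाभ नहीं हुआ। http://example.com 😊"
tokenised = preprocess_doc(doc,
                           params={'remove_numbers': False,
                                   'remove_emoji': True,
                                   'remove_stop_words': True,
                                   'tokenize': True})
print(tokenised)  # a list of token lists, one per detected sentence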
def test_classical_hindi_stops(self): """ Test filtering classical hindi stopwords Sentence extracted from (https://github.com/cltk/hindi_text_ltrc/blob/master/miscellaneous/gandhi/main.txt) """ sentence = " वह काबुली फिर वहां आकर खडा हो गया है " tokenizer = TokenizeSentence('hindi') tokens = tokenizer.tokenize(sentence) no_stops = [word for word in tokens if word not in HINDI_STOPS] target_list = ['काबुली', 'फिर', 'वहां', 'आकर', 'खडा', 'गया'] self.assertEqual(no_stops, target_list)
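For context, a sketch of where HINDI_STOPS can come from; the import path below assumes the pre-1.0 CLTK layout and may differ between versions.

# Hedged sketch of the stopword filtering tested above. The stop-list import
# path is an assumption about older CLTK releases.
from cltk.stop.classical_hindi.stops import STOPS_LIST as HINDI_STOPS
from cltk.tokenize.sentence import TokenizeSentence

tokens = TokenizeSentence('hindi').tokenize(" वह काबुली फिर वहां आकर खडा हो गया है ")
print([word for word in tokens if word not in HINDI_STOPS])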
def createCorpus(text, save=True): ''' :params text - the raw text returns + the corpus, a list of list with tokenized sentences + the vocab (a dictionary with the frequency of the tokens scaled by the total number of words. ''' with open('../../data/stopwords.txt', 'r', encoding="UTF-8") as src: stopwords = src.read() stopwords = stopwords.split('\n') stopwords.extend([".", ",", "?", "!", "-", ":", ";", "·"]) Stokenizer = TokenizeSentence('greek') Wtokenizer = WordTokenizer('greek') sentences = Stokenizer.tokenize(text) new_sentences = [] vocab = dict() print('Building corpus and freqDictionary') for sent in tqdm(sentences, desc="Sentences"): new_sent = Wtokenizer.tokenize(sent) # Stopword deletion new_sent = [w for w in new_sent if w not in stopwords] new_sentences.append(new_sent) for w in new_sent: if w not in vocab: vocab[w] = 1 else: vocab[w] += 1 vocab_size = len(vocab) for k, v in vocab.items(): # Subsampling, see paper by Goldberg & Levy frac = v / vocab_size p_w = (1 + np.sqrt(frac * 0.001)) * 0.001 / frac # update the value for the word vocab[k] = p_w if save: print('Saving the frequencies') with open('../../data/vocabularies/Homer_word_frequencies.json', 'w', encoding='utf-8') as fp: json.dump(vocab, fp, ensure_ascii=False) print('Saving the corpus') arr = np.array(new_sentences, dtype=object) np.save('../../data/Homer_tokenized_corpus.npy', arr) return new_sentences, vocab
def tokenize(self, mode='word'): """Tokenizes the passage into lists of words or sentences. Breaks text words into individual tokens (strings) by default. If mode is set to sentence, returns lists of sentences. Args: mode (:obj:`str`) Mode of tokenization, either 'word' or 'sentence' Returns: :obj:`list` of :obj:`str` Tokenized words (or sentences) Example: >>> LatinText('Gallia est omnis divisa in partes tres').tokenize() ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres'] """ from cltk.tokenize.word import nltk_tokenize_words from cltk.tokenize.sentence import TokenizeSentence if mode == 'sentence': return TokenizeSentence( self.options['language'] ).tokenize_sentences(self.data) else: return nltk_tokenize_words(self.data)
def sentence_tokenizer(self): hindi_text_sentence_tokenize = TokenizeSentence('hindi').tokenize( self.sentence) # print(hindi_text_sentence_tokenize) print("\nHindi sentence tokenize") for i in hindi_text_sentence_tokenize: print(i)
class TextSummarization: sentence = "" hindi_stop_words = set(STOPS_LIST) sentence_tokenizer = TokenizeSentence('hindi') def __init__(self, hindi_text_input): self.sentence = hindi_text_input def word_tokenize(self): word_tokenizer = WordTokenizer('sanskrit') return word_tokenizer.tokenize(self.sentence) # print(hindi_text_words def sentence_tokenizer(self): hindi_text_sentence_tokenize = TokenizeSentence('hindi').tokenize( self.sentence) # print(hindi_text_sentence_tokenize) print("\nHindi sentence tokenize") for i in hindi_text_sentence_tokenize: print(i) def get_stop_words(self): return self.hindi_stop_words def print_stop_words(self): print("Stop words:", self.get_stop_words()[:10]) def get_filtered_sentence(self): filtered_sentence = [] for w in self.word_tokenize(): if w not in self.get_stop_words(): filtered_sentence = filtered_sentence + w.split() print("filtered sentence:", filtered_sentence)
def get_corpus_reader(corpus_name: str = None, language: str = None) -> CorpusReader:
    """
    Corpus reader factory method
    :param corpus_name: the name of the supported corpus, available as: [package].SUPPORTED_CORPORA
    :param language: the language to search in
    :return: NLTK compatible corpus reader
    """
    BASE = '~/cltk_data/{}/text'.format(language)
    root = os.path.join(os.path.expanduser(BASE), corpus_name)
    if not os.path.exists(root) or corpus_name not in SUPPORTED_CORPORA.get(language):
        raise ValueError(
            'Specified corpus data not found, please install {} for language: {}'.format(
                corpus_name, language))
    sentence_tokenizer = TokenizeSentence(language)
    the_word_tokenizer = WordTokenizer(language)
    DOC_PATTERN = r'.*\.txt'  #: Generic file ending, override below in your own CorpusReader implementation
    if language == 'latin':
        if corpus_name == 'latin_text_latin_library':
            skip_keywords = ['Latin', 'Library']
            return FilteredPlaintextCorpusReader(
                root=root,
                fileids=DOC_PATTERN,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                skip_keywords=skip_keywords)
        if corpus_name == 'latin_text_perseus':
            pass
def get_corpus_reader(corpus_name: str = None, language: str = None) -> CorpusReader:
    """
    Corpus reader factory method
    :param corpus_name: the name of the supported corpus, available as: [package].SUPPORTED_CORPORA
    :param language: the language to search in
    :return: NLTK compatible corpus reader
    """
    BASE = '~/cltk_data/{}/text'.format(language)
    root = os.path.join(os.path.expanduser(BASE), corpus_name)
    if not os.path.exists(root) or corpus_name not in SUPPORTED_CORPORA.get(language):
        raise ValueError(
            'Specified corpus data not found, please install {} for language: {}'.format(
                corpus_name, language))
    sentence_tokenizer = TokenizeSentence(language)
    the_word_tokenizer = WordTokenizer(language)
    doc_pattern = r'.*\.txt'  #: Generic file ending, override below in your own CorpusReader implementation

    if language == 'latin':
        if corpus_name == 'latin_text_latin_library':
            skip_keywords = ['Latin', 'Library']
            return FilteredPlaintextCorpusReader(
                root=root,
                fileids=doc_pattern,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                skip_keywords=skip_keywords)
        if corpus_name == 'latin_text_perseus':
            valid_json_root = os.path.join(root, 'cltk_json')  #: we only support this subsection
            return JsonfileCorpusReader(
                root=valid_json_root,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                target_language='latin')  # perseus also contains English

    if language == 'greek':
        if corpus_name == 'greek_text_perseus':
            valid_json_root = os.path.join(root, 'cltk_json')  #: we only support this subsection
            return JsonfileCorpusReader(
                root=valid_json_root,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                target_language='grc')  #: this abbreviation is required
        if corpus_name == 'greek_text_tesserae':
            # tokenizers/taggers need to be replaced with CLTK version
            # most obv. for POS tagging!
            return TesseraeCorpusReader(
                root=root,
                fileids=r'.*\.tess',
                sent_tokenizer=sent_tokenize,
                word_tokenizer=word_tokenize,
                pos_tagger=pos_tag,
                target_language='grc')  #: this abbreviation is required
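A possible way to use the factory, assuming the Latin Library corpus has already been fetched into ~/cltk_data and that FilteredPlaintextCorpusReader exposes the usual NLTK PlaintextCorpusReader methods (fileids, sents):

# Hedged usage sketch for the factory above.
reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')
files = reader.fileids()
print(len(files), 'files')
for sent in reader.sents(files[:1]):
    print(sent)  # a sentence as a list of word tokens
    break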
def compare_sentences(self, str_a, str_b, language): """Tokenize two input strings on sentence boundary and return a matrix of Levenshtein distance ratios. :param language: str (language name) :param string_a: str :param string_b: str :return: list [[Comparison]] """ sents_a = [] sents_b = [] ratios = [] # Make the latin tokenizer if language == "latin": sent_tokenizer = TokenizeSentence('latin') # Make the greek tokenizer elif language == "greek": sent_tokenizer = TokenizeSentence('greek') # Otherwise, if language, is unsupported, throw error stating accepted Language # values that may be used to tokenize sentences else: print("Language for sentence tokenization not recognized. " "Accepted values are 'latin' and 'greek'.") return # If class instance is set to stem words, do so if self.stem_words: stemmer = Stemmer() str_a = stemmer.stem(str_a) str_b = stemmer.stem(str_b) # Tokenize input strings sents_a = sent_tokenizer.tokenize_sentences(str_a) sents_b = sent_tokenizer.tokenize_sentences(str_b) # Process sentences for comparison (taking into account sanitization settings) sents_a = self._process_sentences(sents_a) sents_b = self._process_sentences(sents_b) # Build matrix of edit distance ratios comparisons = self._calculate_ratios(sents_a, sents_b) return comparisons
def bangla_tokenize(text):
    """Tokenize each entry of the spreadsheet column named 'bengali_version'.

    Parameters
    ----------
    text : str
        The texts retrieved from the spreadsheet

    Returns
    -------
    list
        a list of token lists, one per input line
    """
    x = []
    tokenizer = TokenizeSentence('bengali')
    for line in text:
        bengali_text_tokenize = tokenizer.tokenize(line)
        x.insert(0, bengali_text_tokenize)
    # inserting at index 0 reversed the order, so reverse back before returning
    return x[::-1]
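A small usage sketch for bangla_tokenize, with a one-row list standing in for the spreadsheet column:

# Hedged usage sketch: the input normally comes from a spreadsheet column.
rows = ["দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।"]
for tokens in bangla_tokenize(rows):
    print(tokens)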
def tokenizing(hindi,english): matched_englist_list=[] matched_hindi_list=[] non_matched_englist_list=[] non_matched_hindi_list=[] global dat_frame_Matched global dat_frame_Not_Matched global final_df_matched global final_df_non_matched global count_sentence_positive global count_sentence_negative global total_section global count_section_average global total_section_mis_match_english global count_section_average_mis_match_english global total_section_mis_match_hindi global count_section_average_mis_match_hindi tokenizer = TokenizeSentence('hindi') for i in range(len(hindi)): l1=tokenizer.tokenize(hindi[i]) l2=sent_tokenize(english[i]) #print(len(l1)==len(l2)) if len(l1)==len(l2): total_section=total_section+len(l2) count_section_average=count_section_average+1 count_sentence_positive =count_sentence_positive+len(l2) matched_englist_list.extend(l2) matched_hindi_list.extend(l1) else: total_section_mis_match_english=total_section_mis_match_english+len(l2) count_section_average_mis_match_english=count_section_average_mis_match_english+1 total_section_mis_match_hindi=total_section_mis_match_hindi+len(l1) count_section_average_mis_match_hindi=count_section_average_mis_match_hindi+1 #print(l1,l2) count_sentence_negative=count_sentence_negative+len(l1) non_matched_englist_list.append(english[i]) non_matched_hindi_list.append(hindi[i]) final_df_matched['English']=matched_englist_list final_df_matched['Hindi']=matched_hindi_list
class Tokenizer(object): def __init__(self): corpus_importer = CorpusImporter('greek') corpus_importer.import_corpus('greek_models_cltk') self.tokenizer = TokenizeSentence('greek') def calc_word_freq(self, data): word_dict = {} freq_dict = {} words = data.split() total_word = 0 for word in words: if word in STOPS_LIST: continue if word not in word_dict: word_dict[word] = 1 else: word_dict[word] += 1 total_word += 1 for key in word_dict.keys(): freq_dict[key] = word_dict[key] / float(total_word) return freq_dict def tokenize_sentence(self, data): sentence_dict = {} sentences = self.tokenizer.tokenize_sentences(data) word_frequency = 0 freq_dict = self.calc_word_freq(data) for i, sentence in enumerate(sentences): words = sentence.split() for word in words: if word in STOPS_LIST: continue word_frequency += freq_dict[ word] if word in freq_dict else 0.00000000000000000001 len_words = len(words) calc = word_frequency / len_words sentence_dict[sentence] = ((calc, len_words), i) return sentence_dict
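A usage sketch for the class above (the variable names here are illustrative); it assumes the Greek models download succeeds and that STOPS_LIST is imported at module level, as the class expects.

# Hedged usage sketch for Tokenizer: rank sentences by the frequency-based score
# stored in the returned dict, whose values are ((score, sentence_length), position).
greek_summariser = Tokenizer()
sample = "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην. ἄνδρα μοι ἔννεπε Μοῦσα πολύτροπον."
scores = greek_summariser.tokenize_sentence(sample)
for sentence, ((score, length), position) in sorted(
        scores.items(), key=lambda kv: kv[1][0][0], reverse=True):
    print(round(score, 4), sentence)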
import nltk from cltk.tokenize.sentence import TokenizeSentence from cltk.tokenize.word import WordTokenizer from collections import Counter from IPython.display import Image from cltk.stop.latin import STOPS_LIST # See http://docs.cltk.org/en/latest/latin.html#sentence-tokenization cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit." cato_agri_praef_lowered = cato_agri_praef.lower() # create a tokenizer instance of the TokenizeSentence Class latin_sentence_tokenizer = TokenizeSentence('latin') #tokenize the text into sentence tokens cato_sentence_tokens = latin_sentence_tokenizer.tokenize_sentences( cato_agri_praef) # tokenize the text (or specific sentences) into specific words latin_word_tokenizer = WordTokenizer('latin') cato_word_tokens = latin_word_tokenizer.tokenize(cato_agri_praef_lowered) cato_word_tokens_WO_punt = [ token for token in cato_word_tokens if token not in ['.', ',', ':', ';'] ] #print the tokens and the number of tokens num_of_sentences = len(cato_sentence_tokens) num_of_words = len(cato_word_tokens_WO_punt) #print("There are " + str(num_of_sentences) + " sentences in the text") #print("There are " + str(num_of_words) + " words in the text") # for sentence in cato_sentence_tokens:
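Counter and STOPS_LIST are imported at the top of this script but not used in the excerpt; one natural continuation, counting the most frequent content words, could look like this:

# Possible continuation of the script above: frequency of non-stopword tokens,
# using the Counter and STOPS_LIST imports already present.
cato_content_words = [token for token in cato_word_tokens_WO_punt if token not in STOPS_LIST]
cato_word_counts = Counter(cato_content_words)
print(cato_word_counts.most_common(10))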
def sentence_tokenizer(text):
    # str.replace returns a new string, so the results must be reassigned
    text = text.replace(".", " | ")
    text = text.replace("\n", "").strip()
    print("Sentence Tokenizer triggered")
    hindi_text_sentence_tokenize = TokenizeSentence('hindi').tokenize(text)
    return hindi_text_sentence_tokenize
#nltk.download() from nltk.tokenize import sent_tokenize, word_tokenize from openpyxl.workbook import Workbook from selenium import webdriver import time import pickle import random dir0='F:/hindi_english_downloaded_split/epub/english_corpora' os.chdir(dir0) total_file=0 list_dir0=os.listdir() file_low_list=[] English_j=[] Hindi_j=[] pos_sec=0 tokenizer = TokenizeSentence('hindi') total=0 #trans_df=pd.read_excel('F:/hindi_english_downloaded_split/Hindi_all_sentences_trans.xlsx') for io in range(len(list_dir0)): dir1=dir0+'/'+str(list_dir0[io]) os.chdir(dir1) list_dir=os.listdir() for j in range(len(list_dir)): English_1=[] Hindi_1=[] dir2=dir1+'/'+str(list_dir[j]) os.chdir(dir2) df_section=pd.read_excel('paragraph_section_to_section_combined.xlsx') df_sentence=pd.read_excel('train_sentence_level.xlsx') for i in range(len(df_section)): s=str(df_section["Hindi"][i])
def tokenizing():
    matched_englist_list = []
    matched_hindi_list = []
    non_matched_englist_list = []
    non_matched_hindi_list = []
    global dat_frame_Matched
    global dat_frame_Not_Matched
    global final_df_matched
    global final_df_non_matched
    global count_sentence_positive
    global count_sentence_negative
    global total_section
    global count_section_average
    global total_section_mis_match_english
    global count_section_average_mis_match_english
    global total_section_mis_match_hindi
    global count_section_average_mis_match_hindi
    tokenizer = TokenizeSentence('hindi')
    # Sections whose Hindi and English sentence counts agree are kept as aligned pairs;
    # the rest are collected separately for later inspection.
    for index, row in dat_frame_Matched.iterrows():
        #row["Hindi"] = re.sub("\d+\.", "", row["Hindi"])
        #row["English"] = re.sub("\d+\.", "", row["English"])
        l1 = tokenizer.tokenize(row["Hindi"])
        l2 = sent_tokenize(row["English"])
        if len(l1) == len(l2):
            total_section = total_section + len(l2)
            count_section_average = count_section_average + 1
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            total_section_mis_match_english = total_section_mis_match_english + len(l2)
            count_section_average_mis_match_english = count_section_average_mis_match_english + 1
            total_section_mis_match_hindi = total_section_mis_match_hindi + len(l1)
            count_section_average_mis_match_hindi = count_section_average_mis_match_hindi + 1
            count_sentence_negative = count_sentence_negative + len(l1)
            non_matched_englist_list.append(row["English"])
            non_matched_hindi_list.append(row["Hindi"])
    for index, row in dat_frame_Not_Matched.iterrows():
        hind = ' '.join(map(str, row["Hindi"]))
        englsh = ' '.join(map(str, row["English"]))
        l1 = re.split("।", hind)
        l2 = sent_tokenize(englsh)
        if len(l1) == len(l2):
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            count_sentence_negative = count_sentence_negative + len(l2)
            non_matched_englist_list.append(englsh)
            non_matched_hindi_list.append(hind)
    final_df_matched['English'] = matched_englist_list
    final_df_matched['Hindi'] = matched_hindi_list
    final_df_non_matched['Hindi'] = non_matched_hindi_list
    final_df_non_matched['English'] = non_matched_englist_list
    translator = Translator()
    translated_english = []
    try:
        translations = translator.translate(matched_hindi_list, dest='en')
        for translation in translations:
            try:
                translated_english.append(translation.text)
            except:
                translated_english.append("None")
    except:
        # fallback: if the batch call fails, keep the column aligned with "None" placeholders
        for translation in matched_hindi_list:
            try:
                translated_english.append(translation.text)
            except:
                translated_english.append("None")
    final_df_matched['Translated'] = translated_english
def scrap_doc(): #scraping table regex = re.compile('[%s]' % re.escape(string.punctuation)) tokenizer_latin = TokenizeSentence('latin') directory="dataset/dbg" if not os.path.exists(directory): os.makedirs(directory) for i in range (1,9): url="http://sacred-texts.com/cla/jcsr/dbg"+str(i)+".htm" html = urllib.urlopen(url) soup = BeautifulSoup(html) #create text file target_e = open("dataset/dbg/dbg"+str(i)+"_eng.txt", 'w') target_l = open("dataset/dbg/dbg"+str(i)+"_lat.txt", 'w') #to remove <a></a> for tag in soup.find_all('a'): tag.replaceWith('') k=0 for tr in soup.find_all('tr')[0:]: k=k+1 tds = tr.find_all('td') col1=tds[0].text col2=tds[1].text col1_tok=tokenize.sent_tokenize(col1) #col2_tok=tokenize.sent_tokenize(col2) col2_tok=tokenizer_latin.tokenize_sentences(col2) no_sentences_eng=0 #writing sentences to a file for l in range(len(col1_tok)): line=col1_tok[l] #line=regex.sub('', line).strip() if line!="": #line+='.' target_e.write((line.lower()).encode('utf-8')) target_e.write("\n") no_sentences_eng+=1 no_sentences_lat=0 for l in range(len(col2_tok)): line=col2_tok[l] #line=regex.sub('', line).strip() if line!="": #line+='.' target_l.write((line.lower()).encode('utf-8')) target_l.write("\n") no_sentences_lat+=1 if no_sentences_eng!=no_sentences_lat: print ("wrong ",i,k," :",(no_sentences_eng) ,(no_sentences_lat))
def randomizer(authors, titles, texts, sample_size, test_dict, n_samples, smooth_test): """ |--- Function for randomly sampling from texts ---| ::: Authors, Titles, Texts ::: """ sampled_authors = [] sampled_titles = [] sampled_texts = [] # Make train-test dict # Texts under the same author name are collected in one pool and then randomized pooled_dict = {author: [] for author in authors} for author, title, text in zip(authors, titles, texts): if author in pooled_dict: pooled_dict[author].append((title, text)) # Instantiate cltk Tokenizer tokenizer = TokenizeSentence('latin') for author in pooled_dict: # Pool together texts by same author pooled_titles = [tup[0] for tup in pooled_dict[author]] pooled_texts = [tup[1] for tup in pooled_dict[author]] if author in test_dict and test_dict[author] in pooled_titles and smooth_test == False: print("::: test set «{} {}» is sampled in ordinary slices :::".format(author, "+".join(pooled_titles))) bulk = [] for ord_text in pooled_texts: for word in ord_text.strip().split(): word = word.lower() word = "".join([char for char in word if char not in punctuation]) word = word.lower() bulk.append(word) # Safety measure against empty strings in samples bulk = [word for word in bulk if word != ""] bulk = [bulk[i:i+sample_size] for i in range(0, len(bulk), sample_size)] for index, sample in enumerate(bulk): if len(sample) == sample_size: sampled_authors.append(author) sampled_titles.append(test_dict[author] + "_{}".format(str(index + 1))) sampled_texts.append(" ".join(sample)) else: # Make short random samples and add to sampled texts # Remove punctuation in the meantime print("::: training set «{} {}» is randomly sampled from corpus :::".format(author, "+".join(pooled_titles))) pooled_texts = " ".join(pooled_texts) pooled_texts = tokenizer.tokenize_sentences(pooled_texts) if len(pooled_texts) < 20: print("-----| ERROR: please check if input texts have punctuation, tokenization returned only {} sentence(s) |-----".format(len(pooled_texts))) break for _ in range(1, n_samples+1): random_sample = [] while len(" ".join(random_sample).split()) <= sample_size: random_sample.append(random.choice(pooled_texts)) for index, word in enumerate(random_sample): random_sample[index] = "".join([char for char in word if char not in punctuation]) random_sample = " ".join(random_sample).split()[:sample_size] sampled_authors.append(author) sampled_titles.append('sample_{}'.format(_)) sampled_texts.append(" ".join(random_sample)) return sampled_authors, sampled_titles, sampled_texts
import re

from cltk.tokenize.sentence import TokenizeSentence


def substring_after(s, delim):
    """Finds the part of the string which comes after the delimiter."""
    return s.partition(delim)[2]


def substring_before(s, delim):
    """Finds the part of the string which comes before the delimiter."""
    return s.partition(delim)[0]


def substring_before_after(s, delim1, delim2):
    """Finds the part of the string which comes between the two delimiters."""
    temp = s.partition(delim1)[2]
    return temp.partition(delim2)[0]


tokenizer = TokenizeSentence('bengali')
f = open("data.txt", "r", encoding="utf-8")
lines = f.readlines()
lines = [x.rstrip() for x in lines]
i = 0
tokenized_list = []
pattern = r"^([0-9])*\)"
bengali_text_tokenize = []
for line in lines:
    # a numbered heading such as "1)" starts a new group: flush the current tokens
    if re.search(pattern, line):
        tokenized_list.append(bengali_text_tokenize)
        bengali_text_tokenize = []
    bengali_text_tokenize += tokenizer.tokenize(line)
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Yield sentences for an author,
    each sentence being a list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')
        if rm_stops:
            stops = latin_stops  # note: this branch reuses the Latin stop list
        else:
            stops = None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
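Because gen_docs is a generator, it can be consumed lazily; a usage sketch, assuming the PHI5 corpus and the helper imports used above are installed locally:

# Hedged usage sketch: stream a few processed sentences from the PHI5 corpus.
for i, sentence in enumerate(gen_docs('phi5', lemmatize=False, rm_stops=True)):
    print(sentence[:10])  # first ten tokens of each yielded sentence
    if i == 2:
        break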
param = Namespace( raw_text='../data/HomerGesamt_cleaned.txt', stopwords='../data/stopwords.txt', window=15, # quite high but useful for semantic analysis train_prop=0.7, val_prop=0.15, test_prop=0.15, output='../data/Homer_cbow_preprocessed.csv', MASK="<SENT_BOUND>") # load file homer = load_file(param.raw_text) # Sentence tokenizer greek_tokenizer = TokenizeSentence('greek') homer_sentences = greek_tokenizer.tokenize(homer) # clean tokens def clean_delete_stopwords(sentences): ''' :param sentences: a list of sentences :return: the same list whitout stopwords and with spacing after punctuation ''' new_sentences = [] for s in sentences: s = re.sub(r"([.,!?])", r" \1 ", s) tokens = delete_stopwords(stopwords_file=param.stopwords, text=s) tokens = ' '.join(w for w in tokens)