def _tokenize(self, text):
    """Use NLTK's standard tokenizer and remove punctuation.

    :param text: pre-processed text
    :return: tokenized text
    :rtype: list
    """
    sentence_tokenizer = TokenizeSentence('latin')
    sentences = sentence_tokenizer.tokenize_sentences(text.lower())
    sent_words = []
    punkt = PunktLanguageVars()
    for sentence in sentences:
        words = punkt.word_tokenize(sentence)
        assert isinstance(words, list)
        words_new = []
        for word in words:
            # keep the word only if it is not punctuation, an abbreviation, or a number
            if word not in self.punctuation \
                    and word not in self.abbreviations \
                    and word not in self.numbers:
                words_new.append(word)
        # rm any remaining digits here with: re.compile(r'[0-9]')
        sent_words.append(words_new)
    return sent_words
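# A minimal, self-contained sketch of the same pipeline outside the class.
# It filters against string.punctuation only; the method above also consults
# self.abbreviations and self.numbers, instance attributes not shown here.
from string import punctuation

from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence

text = "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
sentences = TokenizeSentence('latin').tokenize_sentences(text.lower())

punkt = PunktLanguageVars()
sent_words = [[w for w in punkt.word_tokenize(s) if w not in punctuation]
              for s in sentences]
print(sent_words)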
def test_sentence_tokenizer_latin(self):
    """Test tokenizing Latin sentences."""
    sentences = "Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti. Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum."  # pylint: disable=line-too-long
    good_tokenized_sentences = ['Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti.', 'Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum.']  # pylint: disable=line-too-long
    tokenizer = TokenizeSentence('latin')
    tokenized_sentences = tokenizer.tokenize_sentences(sentences)
    self.assertEqual(tokenized_sentences, good_tokenized_sentences)
def test_sentence_tokenizer_latin(self):
    """Test tokenizing Latin sentences."""
    text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
    target = ['O di inmortales!', 'ubinam gentium sumus?', 'in qua urbe vivimus?', 'quam rem publicam habemus?', 'Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent!', 'Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero!', 'Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem.']  # pylint: disable=line-too-long
    tokenizer = TokenizeSentence('latin')
    tokenized_sentences = tokenizer.tokenize_sentences(text)
    self.assertEqual(tokenized_sentences, target)
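# The expected behavior is easy to check interactively; this quick sketch is
# grounded in the test data above (Cicero, In Catilinam):
from cltk.tokenize.sentence import TokenizeSentence

tokenizer = TokenizeSentence('latin')
print(tokenizer.tokenize_sentences("O di inmortales! ubinam gentium sumus? in qua urbe vivimus?"))
# ['O di inmortales!', 'ubinam gentium sumus?', 'in qua urbe vivimus?']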
def compare_sentences(self, str_a, str_b, language):
    """Tokenize two input strings on sentence boundaries and return a
    matrix of Levenshtein distance ratios.

    :param str_a: str
    :param str_b: str
    :param language: str (language name)
    :return: list [[Comparison]]
    """
    # Make the sentence tokenizer for the requested language
    if language == "latin":
        sent_tokenizer = TokenizeSentence('latin')
    elif language == "greek":
        sent_tokenizer = TokenizeSentence('greek')
    else:
        # Unsupported language: report the accepted values and bail out
        print("Language for sentence tokenization not recognized. "
              "Accepted values are 'latin' and 'greek'.")
        return

    # If the class instance is set to stem words, do so
    if self.stem_words:
        stemmer = Stemmer()
        str_a = stemmer.stem(str_a)
        str_b = stemmer.stem(str_b)

    # Tokenize input strings
    sents_a = sent_tokenizer.tokenize_sentences(str_a)
    sents_b = sent_tokenizer.tokenize_sentences(str_b)

    # Process sentences for comparison (taking sanitization settings into account)
    sents_a = self._process_sentences(sents_a)
    sents_b = self._process_sentences(sents_b)

    # Build matrix of edit-distance ratios
    comparisons = self._calculate_ratios(sents_a, sents_b)

    return comparisons
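# A sketch of calling this method, assuming it lives on CLTK's TextReuse
# helper as in the old CLTK text-reuse docs (class name and .ratio attribute
# are assumptions here); the input strings are made up.
from cltk.text_reuse.text_reuse import TextReuse

t = TextReuse()
comparisons = t.compare_sentences("Gallia est omnis divisa in partes tres.",
                                  "Gallia est omnis divisa in partes quattuor.",
                                  "latin")
for row in comparisons:
    for comparison in row:
        print(comparison.ratio)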
class Tokenizer(object):

    def __init__(self):
        corpus_importer = CorpusImporter('greek')
        corpus_importer.import_corpus('greek_models_cltk')
        self.tokenizer = TokenizeSentence('greek')

    def calc_word_freq(self, data):
        """Return the relative frequency of each non-stopword in ``data``."""
        word_dict = {}
        freq_dict = {}
        words = data.split()
        total_word = 0
        for word in words:
            if word in STOPS_LIST:
                continue
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1
            total_word += 1
        for key in word_dict.keys():
            freq_dict[key] = word_dict[key] / float(total_word)
        return freq_dict

    def tokenize_sentence(self, data):
        """Score each sentence by the mean frequency of its words."""
        sentence_dict = {}
        sentences = self.tokenizer.tokenize_sentences(data)
        freq_dict = self.calc_word_freq(data)
        for i, sentence in enumerate(sentences):
            word_frequency = 0  # reset the running total for each sentence
            words = sentence.split()
            for word in words:
                if word in STOPS_LIST:
                    continue
                # fall back to a tiny epsilon so unseen words never zero out a score
                word_frequency += freq_dict.get(word, 1e-20)
            len_words = len(words)
            calc = word_frequency / len_words
            sentence_dict[sentence] = ((calc, len_words), i)
        return sentence_dict
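# A sketch of consuming the returned dict for extractive summarization:
# each value is ((mean_word_frequency, sentence_length), position), so the
# top-scoring sentences can be picked like this (the Greek sample is arbitrary).
tok = Tokenizer()  # downloads greek_models_cltk on first run
text = "ἐν ἀρχῇ ἦν ὁ λόγος. καὶ ὁ λόγος ἦν πρὸς τὸν θεόν."
scores = tok.tokenize_sentence(text)
top = sorted(scores.items(), key=lambda kv: kv[1][0][0], reverse=True)[:3]
for sentence, ((score, n_words), position) in top:
    print(position, round(score, 4), sentence)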
import nltk
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from collections import Counter
from IPython.display import Image
from cltk.stop.latin import STOPS_LIST

# See http://docs.cltk.org/en/latest/latin.html#sentence-tokenization
cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit."
cato_agri_praef_lowered = cato_agri_praef.lower()

# create a tokenizer instance of the TokenizeSentence class
latin_sentence_tokenizer = TokenizeSentence('latin')

# tokenize the text into sentence tokens
cato_sentence_tokens = latin_sentence_tokenizer.tokenize_sentences(cato_agri_praef)

# tokenize the text (or specific sentences) into individual words
latin_word_tokenizer = WordTokenizer('latin')
cato_word_tokens = latin_word_tokenizer.tokenize(cato_agri_praef_lowered)
cato_word_tokens_wo_punct = [token for token in cato_word_tokens
                             if token not in ['.', ',', ':', ';']]

# count the tokens
num_of_sentences = len(cato_sentence_tokens)
num_of_words = len(cato_word_tokens_wo_punct)
# print("There are " + str(num_of_sentences) + " sentences in the text")
# print("There are " + str(num_of_words) + " words in the text")
# for sentence in cato_sentence_tokens:
#     print(sentence)
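# Counter is imported above but unused in this cell; a quick frequency table
# over the word tokens, with Latin stopwords removed (a sketch, not part of
# the original cell):
cato_counts = Counter(w for w in cato_word_tokens_wo_punct if w not in STOPS_LIST)
print(cato_counts.most_common(10))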
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Yield sentences for an
    author; each sentence is itself a list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        stops = latin_stops if rm_stops else None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')
        stops = latin_stops if rm_stops else None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)  # CLTK WordTokenizer exposes .tokenize()
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize and sentence:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Yield sentences for an
    author; each sentence is itself a list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        stops = latin_stops if rm_stops else None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        stops = latin_stops if rm_stops else None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer(sentence)  # nltk_tokenize_words is a plain function
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize and sentence:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
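# A sketch of driving the generator: count sentences and build a vocabulary
# over the PHI5 corpus (assumes the corpus and CLTK models are installed locally).
from collections import Counter

vocab = Counter()
n_sents = 0
for sent in gen_docs('phi5', lemmatize=False, rm_stops=True):
    vocab.update(sent)
    n_sents += 1
print(n_sents, "sentences,", len(vocab), "distinct words")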
context = remove_extra_white_spaces(context)
data_list.append(context)

contextFile = open(contextFilePath, 'a+')
questionFile = open(questionFilePath, 'a+')
answerFile = open(answerFilePath, 'a+')
answerSentenceFile = open(answerSentenceFilePath, 'a+')

for paragraph in eachData["paragraphs"]:
    for qa in paragraph["qas"]:
        # Take the first listed answer: qa["answers"][0]["text"]
        answer = qa["answers"][0]["text"]

        # Find the sentence of the context that contains the answer
        foundFlag = False
        for sentence in tokenizer.tokenize_sentences(context):
            if sentence.find(answer) != -1:
                # Answer found
                foundFlag = True
                answerSentenceFile.write(sentence)
                answerSentenceFile.write('\n')
                break

        if not foundFlag:
            # The answer string crosses a sentence boundary or is absent
            if context.find(answer) != -1:
                print("FOUND IN CONTEXT")
            else:
                print("NOT FOUND")
            continue
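# The sentence lookup above as a small standalone helper (names hypothetical,
# and the tokenizer language is assumed to be Latin): return the first
# sentence of `context` containing `answer`, or None if the answer string
# crosses a sentence boundary or is absent.
from cltk.tokenize.sentence import TokenizeSentence

_tokenizer = TokenizeSentence('latin')

def find_answer_sentence(context, answer):
    for sentence in _tokenizer.tokenize_sentences(context):
        if answer in sentence:
            return sentence
    return None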
# -*- coding: utf-8 -*-
import re
from os import listdir

from cltk.tokenize.sentence import TokenizeSentence

textfiles = [f for f in listdir('.') if f.endswith('.txt')]
tokenizer = TokenizeSentence('greek')

for file in textfiles:
    print(file)
    with open(file) as infile:
        text = infile.read()
    for sent in tokenizer.tokenize_sentences(text):
        if re.search('παραδρ', sent):
            print(sent)
def randomizer(authors, titles, texts, sample_size, test_dict, n_samples, smooth_test):
    """Randomly sample from texts, given parallel lists of authors, titles, and texts."""
    sampled_authors = []
    sampled_titles = []
    sampled_texts = []

    # Make train-test dict:
    # texts under the same author name are collected in one pool and then randomized
    pooled_dict = {author: [] for author in authors}
    for author, title, text in zip(authors, titles, texts):
        if author in pooled_dict:
            pooled_dict[author].append((title, text))

    # Instantiate the CLTK tokenizer
    tokenizer = TokenizeSentence('latin')

    for author in pooled_dict:
        # Pool together texts by the same author
        pooled_titles = [tup[0] for tup in pooled_dict[author]]
        pooled_texts = [tup[1] for tup in pooled_dict[author]]

        if author in test_dict and test_dict[author] in pooled_titles and not smooth_test:
            print("::: test set «{} {}» is sampled in ordinary slices :::".format(author, "+".join(pooled_titles)))
            bulk = []
            for ord_text in pooled_texts:
                for word in ord_text.strip().split():
                    word = word.lower()
                    word = "".join([char for char in word if char not in punctuation])
                    bulk.append(word)
            # Safety measure against empty strings in samples
            bulk = [word for word in bulk if word != ""]
            bulk = [bulk[i:i + sample_size] for i in range(0, len(bulk), sample_size)]
            for index, sample in enumerate(bulk):
                if len(sample) == sample_size:
                    sampled_authors.append(author)
                    sampled_titles.append(test_dict[author] + "_{}".format(index + 1))
                    sampled_texts.append(" ".join(sample))
        else:
            # Make short random samples from randomly chosen sentences
            # and strip punctuation along the way
            print("::: training set «{} {}» is randomly sampled from corpus :::".format(author, "+".join(pooled_titles)))
            pooled_texts = " ".join(pooled_texts)
            pooled_texts = tokenizer.tokenize_sentences(pooled_texts)
            if len(pooled_texts) < 20:
                print("-----| ERROR: please check if input texts have punctuation, tokenization returned only {} sentence(s) |-----".format(len(pooled_texts)))
                break
            for n in range(1, n_samples + 1):
                random_sample = []
                while len(" ".join(random_sample).split()) <= sample_size:
                    random_sample.append(random.choice(pooled_texts))
                for index, sent in enumerate(random_sample):
                    random_sample[index] = "".join([char for char in sent if char not in punctuation])
                random_sample = " ".join(random_sample).split()[:sample_size]
                sampled_authors.append(author)
                sampled_titles.append('sample_{}'.format(n))
                sampled_texts.append(" ".join(random_sample))

    return sampled_authors, sampled_titles, sampled_texts
def randomizer(authors, titles, texts, sample_size, test_dict, n_samples, smooth_test):
    """Make random samples from texts.

    Random samples are composed by combining randomly selected sentences.
    """
    sampled_authors = []
    sampled_titles = []
    sampled_texts = []

    # Make train-test dict:
    # texts under the same author name are collected in one pool and then randomized
    pooled_dict = {author: [] for author in authors}
    for author, title, text in zip(authors, titles, texts):
        if author in pooled_dict:
            pooled_dict[author].append((title, text))

    # Instantiate the CLTK tokenizer
    tokenizer = TokenizeSentence('latin')

    for author in pooled_dict:
        # Pool together texts by the same author
        pooled_titles = [tup[0] for tup in pooled_dict[author]]
        pooled_texts = [tup[1] for tup in pooled_dict[author]]

        if author in test_dict and test_dict[author] in pooled_titles and not smooth_test:
            print("::: test set «{} {}» is sampled in ordinary slices :::".format(author, "+".join(pooled_titles)))
            bulk = []
            for ord_text in pooled_texts:
                for word in ord_text.strip().split():
                    word = word.lower()
                    word = "".join([char for char in word if char not in punctuation])
                    bulk.append(word)
            # Safety measure against empty strings in samples
            bulk = [word for word in bulk if word != ""]
            bulk = [bulk[i:i + sample_size] for i in range(0, len(bulk), sample_size)]
            for index, sample in enumerate(bulk):
                if len(sample) == sample_size:
                    sampled_authors.append(author)
                    sampled_titles.append(test_dict[author] + "_{}".format(index + 1))
                    sampled_texts.append(" ".join(sample))
        else:
            # Make short random samples from randomly chosen sentences
            # and strip punctuation along the way
            print("::: training set «{} {}» is randomly sampled from corpus :::".format(author, "+".join(pooled_titles)))
            pooled_texts = " ".join(pooled_texts)
            pooled_texts = tokenizer.tokenize_sentences(pooled_texts)
            if len(pooled_texts) < 20:
                print("-----| ERROR: please check if input texts have punctuation, tokenization returned only {} sentence(s) |-----".format(len(pooled_texts)))
                break
            for n in range(1, n_samples + 1):
                random_sample = []
                while len(" ".join(random_sample).split()) <= sample_size:
                    random_sample.append(random.choice(pooled_texts))
                for index, sent in enumerate(random_sample):
                    random_sample[index] = "".join([char for char in sent if char not in punctuation])
                random_sample = " ".join(random_sample).split()[:sample_size]
                sampled_authors.append(author)
                sampled_titles.append('{}_{}'.format(pooled_titles[0], n))
                sampled_texts.append(" ".join(random_sample))

    return sampled_authors, sampled_titles, sampled_texts
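# A sketch of calling randomizer; the file names and corpus layout are
# hypothetical. With an empty test_dict, every author falls into the
# randomly-sampled training branch.
authors = ['Caesar', 'Caesar', 'Cicero']
titles = ['BG1', 'BG2', 'Cat1']
texts = [open(p).read() for p in ('bg1.txt', 'bg2.txt', 'cat1.txt')]

s_authors, s_titles, s_texts = randomizer(
    authors, titles, texts,
    sample_size=500, test_dict={}, n_samples=10, smooth_test=False)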
import os
import re
import string
from urllib.request import urlopen

from bs4 import BeautifulSoup
from nltk import tokenize
from cltk.tokenize.sentence import TokenizeSentence


def scrap_doc():
    # scrape the parallel English/Latin table of De Bello Gallico from sacred-texts.com
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    tokenizer_latin = TokenizeSentence('latin')

    directory = "dataset/dbg"
    if not os.path.exists(directory):
        os.makedirs(directory)

    for i in range(1, 9):
        url = "http://sacred-texts.com/cla/jcsr/dbg" + str(i) + ".htm"
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')

        # create the output text files
        target_e = open("dataset/dbg/dbg" + str(i) + "_eng.txt", 'w')
        target_l = open("dataset/dbg/dbg" + str(i) + "_lat.txt", 'w')

        # remove <a></a> tags
        for tag in soup.find_all('a'):
            tag.replace_with('')

        k = 0
        for tr in soup.find_all('tr'):
            k += 1
            tds = tr.find_all('td')
            col1 = tds[0].text  # English column
            col2 = tds[1].text  # Latin column
            col1_tok = tokenize.sent_tokenize(col1)
            col2_tok = tokenizer_latin.tokenize_sentences(col2)

            # write one sentence per line, lower-cased
            no_sentences_eng = 0
            for line in col1_tok:
                # line = regex.sub('', line).strip()
                if line != "":
                    target_e.write(line.lower())
                    target_e.write("\n")
                    no_sentences_eng += 1

            no_sentences_lat = 0
            for line in col2_tok:
                # line = regex.sub('', line).strip()
                if line != "":
                    target_l.write(line.lower())
                    target_l.write("\n")
                    no_sentences_lat += 1

            # flag rows where the two columns yield different sentence counts
            if no_sentences_eng != no_sentences_lat:
                print("wrong ", i, k, " :", no_sentences_eng, no_sentences_lat)