Example 1
    def _tokenize(self, text):
        """
        Use NLTK's standard tokenizer, rm punctuation.
        :param text: pre-processed text
        :return: tokenized text
        :rtype : list
        """
        sentence_tokenizer = TokenizeSentence('latin')
        sentences = sentence_tokenizer.tokenize_sentences(text.lower())

        sent_words = []
        punkt = PunktLanguageVars()
        for sentence in sentences:
            words = punkt.word_tokenize(sentence)

            assert isinstance(words, list)
            words_new = []
            for word in words:
                if (word not in self.punctuation
                        and word not in self.abbreviations
                        and word not in self.numbers):
                    words_new.append(word)

            # remove all numbers here with, e.g.: re.sub(r'[0-9]', '', word)
            sent_words.append(words_new)

        return sent_words
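For context, the same sentence-then-word pipeline can be run standalone. The sketch below is illustrative only: it assumes the CLTK Latin models are installed, and the `punctuation` list stands in for the `self.punctuation`, `self.abbreviations`, and `self.numbers` attributes that the enclosing class provides.

# Minimal standalone sketch of the tokenization used in _tokenize above.
# Assumes the CLTK Latin models (latin_models_cltk) are installed; the
# punctuation list is illustrative, not the class's actual attribute.
from cltk.tokenize.sentence import TokenizeSentence
from nltk.tokenize.punkt import PunktLanguageVars

punctuation = ['.', ',', ':', ';', '!', '?', '-', '(', ')']

text = "Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."
sentences = TokenizeSentence('latin').tokenize_sentences(text.lower())

punkt = PunktLanguageVars()
sent_words = [[w for w in punkt.word_tokenize(s) if w not in punctuation]
              for s in sentences]
# sent_words is a list with one list of word tokens per sentence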
Example 2
 def test_sentence_tokenizer_latin(self):
     """Test tokenizing Latin sentences."""
     sentences = "Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti. Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum."  # pylint: disable=line-too-long
     good_tokenized_sentences = ['Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti.', 'Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum.']  # pylint: disable=line-too-long
     tokenizer = TokenizeSentence('latin')
     tokenized_sentences = tokenizer.tokenize_sentences(sentences)
     self.assertEqual(tokenized_sentences, good_tokenized_sentences)
Example 3
 def test_sentence_tokenizer_latin(self):
     """Test tokenizing Latin sentences."""
     text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
     target = ['O di inmortales!', 'ubinam gentium sumus?', 'in qua urbe vivimus?', 'quam rem publicam habemus?', 'Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent!', 'Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero!', 'Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem.']  # pylint: disable=line-too-long
     tokenizer = TokenizeSentence('latin')
     tokenized_sentences = tokenizer.tokenize_sentences(text)
     self.assertEqual(tokenized_sentences, target)
Example 4
    def compare_sentences(self, str_a, str_b, language):
        """Tokenize two input strings on sentence boundary and return a
        matrix of Levenshtein distance ratios.
        :param str_a: str
        :param str_b: str
        :param language: str (language name)
        :return: list [[Comparison]]
        """

        sents_a = []
        sents_b = []
        ratios = []

        # Make the latin tokenizer
        if language == "latin":
            sent_tokenizer = TokenizeSentence('latin')

        # Make the greek tokenizer
        elif language == "greek":
            sent_tokenizer = TokenizeSentence('greek')

        # Otherwise the language is unsupported; report the accepted
        # values that may be used to tokenize sentences and return early
        else:
            print("Language for sentence tokenization not recognized. "
                  "Accepted values are 'latin' and 'greek'.")
            return

        # If class instance is set to stem words, do so
        if self.stem_words:
            stemmer = Stemmer()
            str_a = stemmer.stem(str_a)
            str_b = stemmer.stem(str_b)

        # Tokenize input strings
        sents_a = sent_tokenizer.tokenize_sentences(str_a)
        sents_b = sent_tokenizer.tokenize_sentences(str_b)

        # Process sentences for comparison (taking into account sanitization settings)
        sents_a = self._process_sentences(sents_a)
        sents_b = self._process_sentences(sents_b)

        # Build matrix of edit distance ratios
        comparisons = self._calculate_ratios(sents_a, sents_b)

        return comparisons
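The `_process_sentences` and `_calculate_ratios` helpers are not shown above. Purely as an illustration of the ratio matrix the docstring describes, here is a self-contained sketch that uses the standard library's `difflib`; this is an assumption for demonstration, not the class's actual Levenshtein implementation.

# Illustrative stand-in for the ratio matrix built by compare_sentences:
# one similarity ratio per (sentence_a, sentence_b) pair.
from difflib import SequenceMatcher

from cltk.tokenize.sentence import TokenizeSentence


def ratio_matrix(str_a, str_b, language='latin'):
    tokenizer = TokenizeSentence(language)
    sents_a = tokenizer.tokenize_sentences(str_a)
    sents_b = tokenizer.tokenize_sentences(str_b)
    # SequenceMatcher.ratio() is used here as a simple similarity proxy.
    return [[SequenceMatcher(None, a, b).ratio() for b in sents_b]
            for a in sents_a]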
Example 5
class Tokenizer(object):
    def __init__(self):
        corpus_importer = CorpusImporter('greek')
        corpus_importer.import_corpus('greek_models_cltk')
        self.tokenizer = TokenizeSentence('greek')

    def calc_word_freq(self, data):
        word_dict = {}
        freq_dict = {}
        words = data.split()
        total_word = 0
        for word in words:
            if word in STOPS_LIST:
                continue
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1
            total_word += 1
        for key in word_dict.keys():
            freq_dict[key] = word_dict[key] / float(total_word)
        return freq_dict

    def tokenize_sentence(self, data):
        sentence_dict = {}
        sentences = self.tokenizer.tokenize_sentences(data)
        freq_dict = self.calc_word_freq(data)
        for i, sentence in enumerate(sentences):
            words = sentence.split()
            # Reset the score for each sentence so that scores do not
            # accumulate across the whole document.
            word_frequency = 0
            for word in words:
                if word in STOPS_LIST:
                    continue
                word_frequency += freq_dict[
                    word] if word in freq_dict else 0.00000000000000000001
            len_words = len(words)
            calc = word_frequency / len_words
            sentence_dict[sentence] = ((calc, len_words), i)
        return sentence_dict
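The dict returned by `tokenize_sentence` maps each sentence to a frequency-based score, its length, and its position, which suggests extractive summarization. A purely illustrative way to consume it (assuming the `greek_models_cltk` download succeeds and that `STOPS_LIST` comes from CLTK's Greek stop list, which the excerpt does not show):

# Hypothetical usage of the Tokenizer class above: rank sentences by score.
greek_text = open('greek_sample.txt').read()  # illustrative input file

tok = Tokenizer()
scores = tok.tokenize_sentence(greek_text)  # {sentence: ((score, n_words), index)}

# Sort sentences by score, highest first, and keep the top three.
top = sorted(scores.items(), key=lambda kv: kv[1][0][0], reverse=True)[:3]
for sentence, ((score, n_words), idx) in top:
    print(idx, round(score, 4), sentence)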
Example 6
import nltk
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from collections import Counter
from IPython.display import Image
from cltk.stop.latin import STOPS_LIST

# See http://docs.cltk.org/en/latest/latin.html#sentence-tokenization

cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit."
cato_agri_praef_lowered = cato_agri_praef.lower()
# create a tokenizer instance of the TokenizeSentence Class
latin_sentence_tokenizer = TokenizeSentence('latin')

#tokenize the text into sentence tokens
cato_sentence_tokens = latin_sentence_tokenizer.tokenize_sentences(
    cato_agri_praef)

# tokenize the text (or specific sentences) into specific words
latin_word_tokenizer = WordTokenizer('latin')
cato_word_tokens = latin_word_tokenizer.tokenize(cato_agri_praef_lowered)
cato_word_tokens_WO_punt = [
    token for token in cato_word_tokens if token not in ['.', ',', ':', ';']
]

#print the tokens and the number of tokens
num_of_sentences = len(cato_sentence_tokens)
num_of_words = len(cato_word_tokens_WO_punt)
#print("There are " + str(num_of_sentences) + " sentences in the text")
#print("There are " + str(num_of_words) + " words in the text")
# for sentence in cato_sentence_tokens:
#     print(sentence)
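`Counter` and `STOPS_LIST` are imported above but not used in this excerpt. One plausible continuation, not part of the original script, is a stopword-filtered frequency count over the word tokens:

# Possible continuation (not in the original excerpt): most common content words.
cato_content_words = [t for t in cato_word_tokens_WO_punt if t not in STOPS_LIST]
print(Counter(cato_content_words).most_common(10))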
Example 7
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = latin_stops  # NB: reuses the Latin stoplist for Greek texts
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words


            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
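`gen_docs` is a generator, so a caller iterates over it. A sketch of collecting its output follows; it assumes the PHI5 corpus files are installed where `assemble_phi5_author_filepaths` expects them.

# Hypothetical caller (not in the original): materialize the generator's output.
phi5_sentences = []
for sent in gen_docs('phi5', lemmatize=False, rm_stops=True):
    phi5_sentences.append(sent)  # each item is a list of cleaned, lowercased tokens
print(len(phi5_sentences), 'sentences collected')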
Example 8
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = nltk_tokenize_words

        if rm_stops:
            stops = latin_stops  # NB: reuses the Latin stoplist for Greek texts
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence,
                                    rm_punctuation=True,
                                    rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [
                    w[1:] if w.startswith('-') else w for w in sentence
                ]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words


            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
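The generator yields pre-tokenized sentences, which is the input format expected by embedding trainers such as gensim's Word2Vec. The sketch below is an assumption about downstream use, not part of the original code, and requires gensim plus an installed TLG corpus.

# Assumed downstream use: train word embeddings on the yielded sentences.
from gensim.models import Word2Vec

tlg_sentences = list(gen_docs('tlg', lemmatize=False, rm_stops=True))
model = Word2Vec(sentences=tlg_sentences, min_count=5, workers=4)
model.save('greek_word2vec.model')  # output path is illustrative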
Example 9
    context = remove_extra_white_spaces(context)
    data_list.append(context)

    contextFile = open(contextFilePath, 'a+')
    questionFile = open(questionFilePath, 'a+')
    answerFile = open(answerFilePath, 'a+')
    answerSentenceFile = open(answerSentenceFilePath, 'a+')

    for paragraph in eachData["paragraphs"]:
        for qa in paragraph["qas"]:
            # Append to the output files
            # Answers file -- qa["answers"][0]["text"] -- choosing the first answer
            answer = qa["answers"][0]["text"]
            # Find the answer in the context
            foundFlag = False
            for sentence in tokenizer.tokenize_sentences(context):
                if sentence.find(answer) != -1:
                    # Answer found
                    foundFlag = True
                    answerSentenceFile.write(sentence)
                    answerSentenceFile.write('\n')
                    break

            if not foundFlag:
                # The answer is not found
                if context.find(answer) != -1:
                    print("FOUND IN CONTEXT")
                else:
                    print("NOT FOUND")
                continue
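The fragment above relies on `tokenizer`, `context`, `eachData`, and the output file paths being defined earlier in the script. The core lookup it performs, finding the first tokenized sentence that contains a given answer string, can be isolated as follows (the function name is illustrative):

# Return the first sentence of `context` containing `answer`, or None.
def find_answer_sentence(tokenizer, context, answer):
    for sentence in tokenizer.tokenize_sentences(context):
        if sentence.find(answer) != -1:
            return sentence
    return None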
Example 10
# -*- coding: utf-8 -*-

#from cltk.tokenize.word import WordTokenizer
from cltk.tokenize.sentence import TokenizeSentence
import re
from os import listdir
from os.path import isfile, join
textfiles = [f for f in listdir('.') if f.endswith('.txt')]

#word_tokenizer = WordTokenizer('greek')
tokenizer = TokenizeSentence('greek')

for file in textfiles:
    print(file)

    with open(file) as infile:
        text = infile.read()

    for sent in tokenizer.tokenize_sentences(text):
        if re.search('παραδρ',sent):
            print(sent)
Example 11
def randomizer(authors, titles, texts, sample_size, 
			   test_dict, n_samples, smooth_test):

	""" |--- Function for randomly sampling from texts ---|
		::: Authors, Titles, Texts ::: """
	sampled_authors = []
	sampled_titles = []
	sampled_texts = []

	# Make train-test dict
	# Texts under the same author name are collected in one pool and then randomized
	pooled_dict = {author: [] for author in authors}
	for author, title, text in zip(authors, titles, texts):
		if author in pooled_dict:
			pooled_dict[author].append((title, text))

	# Instantiate cltk Tokenizer
	tokenizer = TokenizeSentence('latin')

	for author in pooled_dict:
		# Pool together texts by same author
		pooled_titles = [tup[0] for tup in pooled_dict[author]]
		pooled_texts = [tup[1] for tup in pooled_dict[author]]

		if author in test_dict and test_dict[author] in pooled_titles and smooth_test == False:
			print("::: test set «{} {}» is sampled in ordinary slices :::".format(author, "+".join(pooled_titles)))
			bulk = []
			for ord_text in pooled_texts:
				for word in ord_text.strip().split():
					word = word.lower()
					word = "".join([char for char in word if char not in punctuation])
					bulk.append(word)
				# Safety measure against empty strings in samples
				bulk = [word for word in bulk if word != ""]
				bulk = [bulk[i:i+sample_size] for i in range(0, len(bulk), sample_size)]
				for index, sample in enumerate(bulk):
					if len(sample) == sample_size: 
						sampled_authors.append(author)
						sampled_titles.append(test_dict[author] + "_{}".format(str(index + 1)))
						sampled_texts.append(" ".join(sample))

		else:
			# Make short random samples and add to sampled texts
			# Remove punctuation in the meantime
			print("::: training set «{} {}» is randomly sampled from corpus :::".format(author, "+".join(pooled_titles)))
			pooled_texts = " ".join(pooled_texts)
			pooled_texts = tokenizer.tokenize_sentences(pooled_texts)
			if len(pooled_texts) < 20:
				print("-----| ERROR: please check if input texts have punctuation, tokenization returned only {} sentence(s) |-----".format(len(pooled_texts)))
				break
			for _ in range(1, n_samples+1):
				random_sample = []
				while len(" ".join(random_sample).split()) <= sample_size:
					random_sample.append(random.choice(pooled_texts))
				for index, word in enumerate(random_sample):
					random_sample[index] = "".join([char for char in word if char not in punctuation])
				random_sample = " ".join(random_sample).split()[:sample_size]
				sampled_authors.append(author)
				sampled_titles.append('sample_{}'.format(_))
				sampled_texts.append(" ".join(random_sample))

	return sampled_authors, sampled_titles, sampled_texts
Example 12
def randomizer(authors, titles, texts, sample_size, test_dict, n_samples,
               smooth_test):
    """ 
	Function for making random samples from texts.
	Random samples are composed by combining randomly selected sentences.
	"""

    sampled_authors = []
    sampled_titles = []
    sampled_texts = []

    # Make train-test dict
    # Texts under the same author name are collected in one pool and then randomized
    pooled_dict = {author: [] for author in authors}
    for author, title, text in zip(authors, titles, texts):
        if author in pooled_dict:
            pooled_dict[author].append((title, text))

    # Instantiate cltk Tokenizer
    tokenizer = TokenizeSentence('latin')

    for author in pooled_dict:
        # Pool together texts by same author
        pooled_titles = [tup[0] for tup in pooled_dict[author]]
        pooled_texts = [tup[1] for tup in pooled_dict[author]]

        if author in test_dict and test_dict[
                author] in pooled_titles and smooth_test == False:
            print("::: test set «{} {}» is sampled in ordinary slices :::".
                  format(author, "+".join(pooled_titles)))
            bulk = []
            for ord_text in pooled_texts:
                for word in ord_text.strip().split():
                    word = word.lower()
                    word = "".join(
                        [char for char in word if char not in punctuation])
                    bulk.append(word)
                # Safety measure against empty strings in samples
                bulk = [word for word in bulk if word != ""]
                bulk = [
                    bulk[i:i + sample_size]
                    for i in range(0, len(bulk), sample_size)
                ]
                for index, sample in enumerate(bulk):
                    if len(sample) == sample_size:
                        sampled_authors.append(author)
                        sampled_titles.append(test_dict[author] +
                                              "_{}".format(str(index + 1)))
                        sampled_texts.append(" ".join(sample))

        else:
            # Make short random samples and add to sampled texts
            # Remove punctuation in the meantime
            print("::: training set «{} {}» is randomly sampled from corpus :::".format(author, \
               "+".join(pooled_titles)))
            pooled_texts = " ".join(pooled_texts)
            pooled_texts = tokenizer.tokenize_sentences(pooled_texts)
            if len(pooled_texts) < 20:
                print(
                    "-----| ERROR: please check if input texts have punctuation, "
                    "tokenization returned only {} sentence(s) |-----".format(
                        len(pooled_texts)))
                break
            for _ in range(1, n_samples + 1):
                random_sample = []
                while len(" ".join(random_sample).split()) <= sample_size:
                    random_sample.append(random.choice(pooled_texts))
                for index, word in enumerate(random_sample):
                    random_sample[index] = "".join(
                        [char for char in word if char not in punctuation])
                random_sample = " ".join(random_sample).split()[:sample_size]
                sampled_authors.append(author)
                sampled_titles.append('{}_{}'.format(pooled_titles[0], _))
                sampled_texts.append(" ".join(random_sample))

    return sampled_authors, sampled_titles, sampled_texts
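A hedged sketch of how `randomizer` might be called; the author names, titles, and text variables below are placeholders, not values from the original code.

# Hypothetical call: draw 10 random 500-word samples per author, keeping
# 'Gallic War' aside as an ordinarily sliced test set for 'Caesar'.
authors = ['Caesar', 'Caesar']
titles = ['Gallic War', 'Civil War']
texts = [gallic_war_text, civil_war_text]  # placeholder plain-text strings

sampled_authors, sampled_titles, sampled_texts = randomizer(
    authors, titles, texts,
    sample_size=500,
    test_dict={'Caesar': 'Gallic War'},
    n_samples=10,
    smooth_test=False)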
def scrap_doc():
	#scraping table
	regex = re.compile('[%s]' % re.escape(string.punctuation))
	
	tokenizer_latin = TokenizeSentence('latin')	
	directory="dataset/dbg"
	if not os.path.exists(directory):
			os.makedirs(directory)

	for i in range (1,9):
		url="http://sacred-texts.com/cla/jcsr/dbg"+str(i)+".htm"
		
		html = urllib.urlopen(url)
		soup = BeautifulSoup(html)

		
		#create text file
		target_e = open("dataset/dbg/dbg"+str(i)+"_eng.txt", 'w')
		target_l = open("dataset/dbg/dbg"+str(i)+"_lat.txt", 'w')

		#to remove <a></a>
		for tag in soup.find_all('a'):
			tag.replaceWith('')
		
		k=0
		for tr in soup.find_all('tr')[0:]:
			k=k+1
			tds = tr.find_all('td')
			col1=tds[0].text
			col2=tds[1].text
	
			col1_tok=tokenize.sent_tokenize(col1)
			#col2_tok=tokenize.sent_tokenize(col2)
			
			col2_tok=tokenizer_latin.tokenize_sentences(col2)

			no_sentences_eng=0
			#writing sentences to a file
			for l in range(len(col1_tok)):
				line=col1_tok[l]
				#line=regex.sub('', line).strip()
			
			
				if line!="":
					#line+='.'
					target_e.write((line.lower()).encode('utf-8'))
					target_e.write("\n")
					no_sentences_eng+=1
			
			no_sentences_lat=0
			for l in range(len(col2_tok)):
				line=col2_tok[l]
				#line=regex.sub('', line).strip()
			
			
				if line!="":
					#line+='.'
					target_l.write((line.lower()).encode('utf-8'))
					target_l.write("\n")
					no_sentences_lat+=1
			
			if no_sentences_eng != no_sentences_lat:
				print("wrong ", i, k, " :", no_sentences_eng, no_sentences_lat)