Example #1
 def test_sentence_tokenizer_latin(self):
     """Test tokenizing Latin sentences."""
     text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
     target = ['O di inmortales!', 'ubinam gentium sumus?', 'in qua urbe vivimus?', 'quam rem publicam habemus?', 'Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent!', 'Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero!', 'Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem.']  # pylint: disable=line-too-long
     tokenizer = TokenizeSentence('latin')
     tokenized_sentences = tokenizer.tokenize_sentences(text)
     self.assertEqual(tokenized_sentences, target)
Example #2
 def test_sentence_tokenizer_marathi(self):
     """Test tokenizing marathi sentences."""
     text = "अर्जुन उवाच । एवं सतत युक्ता ये भक्तास्त्वां पर्युपासते । ये चाप्यक्षरमव्यक्तं तेषां के योगवित्तमाः ॥"
     target = ['अर्जुन', 'उवाच', '।', 'एवं', 'सतत', 'युक्ता', 'ये', 'भक्तास्त्वां', 'पर्युपासते', '।', 'ये', 'चाप्यक्षरमव्यक्तं', 'तेषां', 'के', 'योगवित्तमाः', '॥']
     tokenizer = TokenizeSentence('marathi')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #3
    def _tokenize(self, text):
        """
        Use NLTK's standard tokenizer, rm punctuation.
        :param text: pre-processed text
        :return: tokenized text
        :rtype : list
        """
        sentence_tokenizer = TokenizeSentence('latin')
        sentences = sentence_tokenizer.tokenize_sentences(text.lower())

        sent_words = []
        punkt = PunktLanguageVars()
        for sentence in sentences:
            words = punkt.word_tokenize(sentence)

            assert isinstance(words, list)
            words_new = []
            for word in words:
                if (word not in self.punctuation
                        and word not in self.abbreviations
                        and word not in self.numbers):
                    words_new.append(word)

            # rm all numbers here with: re.compile(r'[0-9]')
            sent_words.append(words_new)

        return sent_words
Example #5
 def test_sentence_tokenizer_latin(self):
     """Test tokenizing Latin sentences."""
     sentences = "Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti. Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum."  # pylint: disable=line-too-long
     good_tokenized_sentences = ['Itaque cum M. Aurelio et P. Minidio et Cn. Cornelio ad apparationem balistarum et scorpionem reliquorumque tormentorum refectionem fui praesto et cum eis commoda accepi, quae cum primo mihi tribuisiti recognitionem, per sorosis commendationem servasti.', 'Cum ergo eo beneficio essem obligatus, ut ad exitum vitae non haberem inopiae timorem, haec tibi scribere coepi, quod animadverti multa te aedificavisse et nunc aedificare, reliquo quoque tempore et publicorum et privatorum aedificiorum, pro amplitudine rerum gestarum ut posteris memoriae traderentur curam habiturum.']  # pylint: disable=line-too-long
     tokenizer = TokenizeSentence('latin')
     tokenized_sentences = tokenizer.tokenize_sentences(sentences)
     self.assertEqual(tokenized_sentences, good_tokenized_sentences)
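Example #8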
def cleaning_data(text):
    tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = tokenizer.tokenize(text)
    # print(bengali_text_tokenize)
    cleaned = clean(bengali_text_tokenize)
    cleaned = ' '.join(cleaned)
    return cleaned
Example #9
 def test_sentence_tokenizer_sanskrit(self):
     """Test tokenizing sanskrit sentences."""
     text = "श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः। नानाविधानि दिव्यानि नानावर्णाकृतीनि च।।"
     target = ['श्री', 'भगवानुवाच', 'पश्य', 'मे', 'पार्थ', 'रूपाणि', 'शतशोऽथ', 'सहस्रशः', '।', 'नानाविधानि', 'दिव्यानि', 'नानावर्णाकृतीनि', 'च', '।', '।']
     tokenizer = TokenizeSentence('sanskrit')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #10
 def test_sentence_tokenizer_telugu(self):
     """Test tokenizing telugu sentences."""
     text = "తా. ఎక్కడెక్కడ బుట్టిన నదులును రత్నాకరుడను నాశతో సముద్రుని చేరువిధముగా నెన్నియిక్కట్టులకైన నోర్చి ప్రజలు దమంతట దామె ప్రియముం జూపుచు ధనికుని యింటికేతెంచుచుందురు."
     target = ['తా', '.', 'ఎక్కడెక్కడ', 'బుట్టిన', 'నదులును', 'రత్నాకరుడను', 'నాశతో', 'సముద్రుని', 'చేరువిధముగా', 'నెన్నియిక్కట్టులకైన', 'నోర్చి', 'ప్రజలు', 'దమంతట', 'దామె', 'ప్రియముం', 'జూపుచు', 'ధనికుని', 'యింటికేతెంచుచుందురు', '.']
     tokenizer = TokenizeSentence('telugu')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #11
 def test_sentence_tokenizer_classical_hindi(self):
     """Test tokenizing classical_hindi sentences."""
     text = "जलर्  चिकित्सा से उन्हें कोई लाभ नहीं हुआ।"
     target = ['जलर्', 'चिकित्सा', 'से', 'उन्हें', 'कोई', 'लाभ', 'नहीं', 'हुआ', '।']
     tokenizer = TokenizeSentence('hindi')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #12
def tokenizing():
    matched_englist_list = []
    matched_hindi_list = []
    non_matched_englist_list = []
    non_matched_hindi_list = []
    global dat_frame_Matched
    global dat_frame_Not_Matched
    global final_df_matched
    global final_df_non_matched
    global count_sentence_positive
    global count_sentence_negative
    global total_section
    global count_section_average
    global total_section_mis_match_english
    global count_section_average_mis_match_english
    global total_section_mis_match_hindi
    global count_section_average_mis_match_hindi
    tokenizer = TokenizeSentence('hindi')
    for index, row in dat_frame_Matched.iterrows():
        #print(row['name'], row['age']):
        #print(row["Hindi"],row["English"])
        #row["Hindi"]=re.sub("\d+\.","",row["Hindi"])
        #row["English"]=re.sub("\d+\.","",row["English"])
        l1 = tokenizer.tokenize(row["Hindi"])
        l2 = sent_tokenize(row["English"])
        #print(len(l1)==len(l2))
        if len(l1) == len(l2):
            total_section = total_section + len(l2)
            count_section_average = count_section_average + 1
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            total_section_mis_match_english = total_section_mis_match_english + len(
                l2)
            count_section_average_mis_match_english = count_section_average_mis_match_english + 1
            total_section_mis_match_hindi = total_section_mis_match_hindi + len(
                l1)
            count_section_average_mis_match_hindi = count_section_average_mis_match_hindi + 1
            #print(l1,l2)
            count_sentence_negative = count_sentence_negative + len(l1)
            non_matched_englist_list.append(row["English"])
            non_matched_hindi_list.append(row["Hindi"])
    for index, row in dat_frame_Not_Matched.iterrows():
        hind = ' '.join(map(str, row["Hindi"]))
        englsh = ' '.join(map(str, row["English"]))
        l1 = re.split("।", hind)
        l2 = sent_tokenize(englsh)
        if len(l1) == len(l2):
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            count_sentence_negative = count_sentence_negative + len(l2)
            non_matched_englist_list.append(englsh)
            non_matched_hindi_list.append(hind)
    final_df_matched['English'] = matched_englist_list
    final_df_matched['Hindi'] = matched_hindi_list
    final_df_non_matched['Hindi'] = non_matched_hindi_list
    final_df_non_matched['English'] = non_matched_englist_list
Example #13
 def test_sentence_tokenizer_bengali(self):
     """Test tokenizing bengali sentences."""
     text = "দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।"
     target = ['দুর্ব্বাসার', 'শাপে', 'রাজা', 'শকুন্তলাকে', 'একেবারে', 'ভুলে', 'বেশ', 'সুখে', 'আছেন', '।']
     tokenizer = TokenizeSentence('bengali')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
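Example #14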
def porter_tokenizer(text):
    """
    Tokenizes a Bengali sentence into word tokens using CLTK's
    TokenizeSentence. Note that, despite the function's name, no Porter
    stemming or punctuation filtering is applied here; the tokenizer's
    output is returned unchanged.

    Parameters
    ----------

    text : `str`.
      A sentence that is to be split into words.

    Returns
    ----------

    bengali_text_tokenize : `list`.
      A list of tokens.

    """
    tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = tokenizer.tokenize(text)

    return bengali_text_tokenize
Example #15
    def test_sentence_tokenizer_sanskrit(self):
        """Test tokenizing Sanskrit sentences."""
        text = """श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः। यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।
न मे विदुः सुरगणाः प्रभवं न महर्षयः। अहमादिर्हि देवानां महर्षीणां च सर्वशः।।"""
        target = ['श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः।','यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।', 'न मे विदुः सुरगणाः प्रभवं न महर्षयः।', 'अहमादिर्हि देवानां महर्षीणां च सर्वशः।।']
        tokenizer = TokenizeSentence('sanskrit')
        tokenized_sentences = tokenizer.tokenize(text)
        self.assertEqual(tokenized_sentences, target)
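Example #16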
def preprocess_doc(
    sent,
    params={
        'remove_numbers': False,
        'remove_emoji': True,
        'remove_stop_words': True,
        'tokenize': True
    }):
    '''Multi-lingual tokenizer.

    input: a document / sentence; params is a dict of control flags
    output: a list of token lists for the entire document/sentence
    '''

    sent = emoji.demojize(sent)
    sent = re.sub(r"http\S+", '', sent)
    sent = re.sub(r"www\.\S+", '', sent)

    if (params['remove_numbers'] == True):
        sent = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", sent)
    sent = re.sub(r"/-", " ", sent)
    sent = re.sub(r"#,\,", " ", sent)
    tokenizer = TokenizeSentence('hindi')
    sents = tokenizer.tokenize(sent)
    all_sents = []

    for s in sents:
        if (params['remove_emoji'] == True):
            s = re.sub(r":\S+:", "", s)
        else:
            s = re.sub(r"[:\*]", "", s)

        punc = set(punctuation) - set('.')

        newtext = []
        for k, g in groupby(s):
            if k in punc:
                newtext.append(k)
            else:
                newtext.extend(g)

        s = ''.join(newtext)

        s = re.sub('[' + re.escape(''.join(puncts)) + ']', '', s)
        s = s.lower()
        if (params['tokenize'] == True):
            msg = tok.tokenize(s)
        else:
            msg = s

        if ((params['tokenize'] == True)
                and (params['remove_stop_words'] == True)):
            msg_filtered = [word for word in msg if word not in stop_for_this]
        else:
            msg_filtered = msg
        if (len(msg_filtered) > 0):
            all_sents.append(msg_filtered)

    return all_sents
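preprocess_doc relies on several module-level names that are not shown in this excerpt (puncts, tok, stop_for_this). A minimal call sketch, assuming those objects are defined elsewhere in the module:

# Hypothetical usage; 'puncts', 'tok' and 'stop_for_this' must exist at module level.
tokens_per_sentence = preprocess_doc(
    "यह एक वाक्य है। http://example.com",
    params={'remove_numbers': True, 'remove_emoji': True,
            'remove_stop_words': False, 'tokenize': True})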
Example #17
 def test_sentence_tokenizer_bengali(self):
     """Test tokenizing bengali sentences."""
     text = "দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।"
     target = [
         'দুর্ব্বাসার', 'শাপে', 'রাজা', 'শকুন্তলাকে', 'একেবারে', 'ভুলে',
         'বেশ', 'সুখে', 'আছেন', '।'
     ]
     tokenizer = TokenizeSentence('bengali')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #18
 def test_sentence_tokenizer_classical_hindi(self):
     """Test tokenizing classical_hindi sentences."""
     text = "जलर्  चिकित्सा से उन्हें कोई लाभ नहीं हुआ।"
     target = [
         'जलर्', 'चिकित्सा', 'से', 'उन्हें', 'कोई', 'लाभ', 'नहीं', 'हुआ',
         '।'
     ]
     tokenizer = TokenizeSentence('hindi')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #19
 def test_classical_hindi_stops(self):
     """
     Test filtering classical hindi stopwords
     Sentence extracted from (https://github.com/cltk/hindi_text_ltrc/blob/master/miscellaneous/gandhi/main.txt)
     """
     sentence = " वह काबुली फिर वहां आकर खडा हो गया है  "
     tokenizer = TokenizeSentence('hindi')
     tokens = tokenizer.tokenize(sentence)
     no_stops = [word for word in tokens if word not in HINDI_STOPS]
     target_list = ['काबुली', 'फिर', 'वहां', 'आकर', 'खडा', 'गया']
     self.assertEqual(no_stops, target_list)
Example #21
 def test_sentence_tokenizer_sanskrit(self):
     """Test tokenizing sanskrit sentences."""
     text = "श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः। नानाविधानि दिव्यानि नानावर्णाकृतीनि च।।"
     target = [
         'श्री', 'भगवानुवाच', 'पश्य', 'मे', 'पार्थ', 'रूपाणि', 'शतशोऽथ',
         'सहस्रशः', '।', 'नानाविधानि', 'दिव्यानि', 'नानावर्णाकृतीनि', 'च',
         '।', '।'
     ]
     tokenizer = TokenizeSentence('sanskrit')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #22
 def test_sentence_tokenizer_marathi(self):
     """Test tokenizing marathi sentences."""
     text = "अर्जुन उवाच । एवं सतत युक्ता ये भक्तास्त्वां पर्युपासते । ये चाप्यक्षरमव्यक्तं तेषां के योगवित्तमाः ॥"
     target = [
         'अर्जुन', 'उवाच', '।', 'एवं', 'सतत', 'युक्ता', 'ये',
         'भक्तास्त्वां', 'पर्युपासते', '।', 'ये', 'चाप्यक्षरमव्यक्तं',
         'तेषां', 'के', 'योगवित्तमाः', '॥'
     ]
     tokenizer = TokenizeSentence('marathi')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #23
 def test_sentence_tokenizer_telugu(self):
     """Test tokenizing telugu sentences."""
     text = "తా. ఎక్కడెక్కడ బుట్టిన నదులును రత్నాకరుడను నాశతో సముద్రుని చేరువిధముగా నెన్నియిక్కట్టులకైన నోర్చి ప్రజలు దమంతట దామె ప్రియముం జూపుచు ధనికుని యింటికేతెంచుచుందురు."
     target = [
         'తా', '.', 'ఎక్కడెక్కడ', 'బుట్టిన', 'నదులును', 'రత్నాకరుడను',
         'నాశతో', 'సముద్రుని', 'చేరువిధముగా', 'నెన్నియిక్కట్టులకైన',
         'నోర్చి', 'ప్రజలు', 'దమంతట', 'దామె', 'ప్రియముం', 'జూపుచు',
         'ధనికుని', 'యింటికేతెంచుచుందురు', '.'
     ]
     tokenizer = TokenizeSentence('telugu')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #24
    def test_sentence_tokenizer_sanskrit(self):
        """Test tokenizing Sanskrit sentences."""
        text = """श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः। यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।
न मे विदुः सुरगणाः प्रभवं न महर्षयः। अहमादिर्हि देवानां महर्षीणां च सर्वशः।।"""
        target = [
            'श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः।',
            'यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।',
            'न मे विदुः सुरगणाः प्रभवं न महर्षयः।',
            'अहमादिर्हि देवानां महर्षीणां च सर्वशः।।'
        ]
        tokenizer = TokenizeSentence('sanskrit')
        tokenized_sentences = tokenizer.tokenize(text)
        self.assertEqual(tokenized_sentences, target)
Example #25
def createCorpus(text, save=True):
    '''
    :param text: the raw text

    returns  + the corpus, a list of lists with tokenized sentences
             + the vocab (a dictionary with the frequency of the tokens scaled by the total number of words)

    '''
    with open('../../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        stopwords = src.read()

    stopwords = stopwords.split('\n')
    stopwords.extend([".", ",", "?", "!", "-", ":", ";", "·"])

    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')
    sentences = Stokenizer.tokenize(text)
    new_sentences = []
    vocab = dict()
    print('Building corpus and freqDictionary')
    for sent in tqdm(sentences, desc="Sentences"):
        new_sent = Wtokenizer.tokenize(sent)
        # Stopword deletion
        new_sent = [w for w in new_sent if w not in stopwords]
        new_sentences.append(new_sent)
        for w in new_sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    vocab_size = len(vocab)
    for k, v in vocab.items():
        # Subsampling, see paper by Goldberg & Levy
        frac = v / vocab_size
        p_w = (1 + np.sqrt(frac / 0.001)) * 0.001 / frac
        # update the value for the word
        vocab[k] = p_w
    if save:
        print('Saving the frequencies')
        with open('../../data/vocabularies/Homer_word_frequencies.json',
                  'w',
                  encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)
        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../../data/Homer_tokenized_corpus.npy', arr)
    return new_sentences, vocab
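A brief usage sketch for createCorpus (an assumption: a plain-text Greek file named homer.txt; the stopword and output paths inside the function are hard-coded relative paths):

# Hypothetical usage; 'homer.txt' is a placeholder input file.
with open('homer.txt', 'r', encoding='utf-8') as src:
    raw_text = src.read()
corpus, vocab = createCorpus(raw_text, save=False)
print(len(corpus), 'sentences,', len(vocab), 'vocabulary entries')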
Example #26
    def tokenize(self, mode='word'):
        """Tokenizes the passage into lists of words or sentences.

        Breaks the text into individual word tokens (strings) by default. If
        mode is set to 'sentence', returns a list of sentences.

        Args:
            mode (:obj:`str`) Mode of tokenization, either 'word' or 'sentence'

        Returns:
            :obj:`list` of :obj:`str` Tokenized words (or sentences)

        Example:
            >>> LatinText('Gallia est omnis divisa in partes tres').tokenize()
            ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']

        """
        from cltk.tokenize.word import nltk_tokenize_words
        from cltk.tokenize.sentence import TokenizeSentence
        if mode == 'sentence':
            return TokenizeSentence(
                self.options['language']
            ).tokenize_sentences(self.data)
        else:
            return nltk_tokenize_words(self.data)
Example #27
 def sentence_tokenizer(self):
     hindi_text_sentence_tokenize = TokenizeSentence('hindi').tokenize(
         self.sentence)
     # print(hindi_text_sentence_tokenize)
     print("\nHindi sentence tokenize")
     for i in hindi_text_sentence_tokenize:
         print(i)
Example #28
class TextSummarization:
    sentence = ""
    hindi_stop_words = set(STOPS_LIST)
    sentence_tokenizer = TokenizeSentence('hindi')

    def __init__(self, hindi_text_input):
        self.sentence = hindi_text_input

    def word_tokenize(self):
        word_tokenizer = WordTokenizer('sanskrit')
        return word_tokenizer.tokenize(self.sentence)
        # print(hindi_text_words)

    def sentence_tokenizer(self):
        hindi_text_sentence_tokenize = TokenizeSentence('hindi').tokenize(
            self.sentence)
        # print(hindi_text_sentence_tokenize)
        print("\nHindi sentence tokenize")
        for i in hindi_text_sentence_tokenize:
            print(i)

    def get_stop_words(self):
        return self.hindi_stop_words

    def print_stop_words(self):
        print("Stop words:", list(self.get_stop_words())[:10])

    def get_filtered_sentence(self):
        filtered_sentence = []
        for w in self.word_tokenize():
            if w not in self.get_stop_words():
                filtered_sentence = filtered_sentence + w.split()

        print("filtered sentence:", filtered_sentence)
Example #29
def get_corpus_reader(corpus_name: str = None,
                      language: str = None) -> CorpusReader:
    """
    Corpus reader factory method
    :param corpus_name: the name of the supported corpus, available as: [package].SUPPORTED_CORPORA
    :param language: the language to search in
    :return: NLTK compatible corpus reader
    """
    BASE = '~/cltk_data/{}/text'.format(language)
    root = os.path.join(os.path.expanduser(BASE), corpus_name)

    if not os.path.exists(root) or corpus_name not in SUPPORTED_CORPORA.get(
            language):
        raise ValueError(
            'Specified corpus data not found, please install {} for language: {}'
            .format(corpus_name, language))

    sentence_tokenizer = TokenizeSentence(language)
    the_word_tokenizer = WordTokenizer(language)

    DOC_PATTERN = r'.*\.txt'  #: Generic file ending, override below in your own CorpusReader implementation

    if language == 'latin':
        if corpus_name == 'latin_text_latin_library':
            skip_keywords = ['Latin', 'Library']
            return FilteredPlaintextCorpusReader(
                root=root,
                fileids=DOC_PATTERN,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                skip_keywords=skip_keywords)
        if corpus_name == 'latin_text_perseus':
            pass
Example #30
def get_corpus_reader(corpus_name: str = None,
                      language: str = None) -> CorpusReader:
    """
    Corpus reader factory method
    :param corpus_name: the name of the supported corpus, available as: [package].SUPPORTED_CORPORA
    :param language: the language to search in
    :return: NLTK compatible corpus reader
    """
    BASE = '~/cltk_data/{}/text'.format(language)
    root = os.path.join(os.path.expanduser(BASE), corpus_name)

    if not os.path.exists(root) or corpus_name not in SUPPORTED_CORPORA.get(
            language):
        raise ValueError(
            'Specified corpus data not found, please install {} for language: {}'
            .format(corpus_name, language))

    sentence_tokenizer = TokenizeSentence(language)
    the_word_tokenizer = WordTokenizer(language)
    doc_pattern = r'.*\.txt'  #: Generic file ending, override below in your own CorpusReader implementation

    if language == 'latin':
        if corpus_name == 'latin_text_latin_library':
            skip_keywords = ['Latin', 'Library']
            return FilteredPlaintextCorpusReader(
                root=root,
                fileids=doc_pattern,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                skip_keywords=skip_keywords)
        if corpus_name == 'latin_text_perseus':
            valid_json_root = os.path.join(
                root, 'cltk_json')  #: we only support this subsection
            return JsonfileCorpusReader(
                root=valid_json_root,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                target_language='latin')  # perseus also contains English

    if language == 'greek':
        if corpus_name == 'greek_text_perseus':
            valid_json_root = os.path.join(
                root, 'cltk_json')  #: we only support this subsection
            return JsonfileCorpusReader(
                root=valid_json_root,
                sent_tokenizer=sentence_tokenizer,
                word_tokenizer=the_word_tokenizer,
                target_language='grc')  #: this abbreviation is required

        if corpus_name == 'greek_text_tesserae':
            # tokenizers/taggers need to be replaced with CLTK version
            # most obv. for POS tagging!
            return TesseraeCorpusReader(
                root=root,
                fileids=r'.*\.tess',
                sent_tokenizer=sent_tokenize,
                word_tokenizer=word_tokenize,
                pos_tagger=pos_tag,
                target_language='grc')  #: this abbreviation is required
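A minimal usage sketch of get_corpus_reader (an assumption: the 'latin_text_latin_library' corpus has already been downloaded under ~/cltk_data/latin/text, and FilteredPlaintextCorpusReader follows NLTK's standard corpus reader API):

# Hypothetical usage; requires the corpus to be installed beforehand.
reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')
fileids = reader.fileids()
for sent in reader.sents(fileids[:1]):  # sentences of the first file
    print(sent)
    break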
Example #31
    def compare_sentences(self, str_a, str_b, language):
        """Tokenize two input strings on sentence boundary and return a
        matrix of Levenshtein distance ratios.
        :param language: str (language name)
        :param str_a: str
        :param str_b: str
        :return: list [[Comparison]]
        """

        sents_a = []
        sents_b = []
        ratios = []

        # Make the latin tokenizer
        if language == "latin":
            sent_tokenizer = TokenizeSentence('latin')

        # Make the greek tokenizer
        elif language == "greek":
            sent_tokenizer = TokenizeSentence('greek')

        # Otherwise, if the language is unsupported, print an error stating the
        # accepted language values that may be used to tokenize sentences
        else:
            print("Language for sentence tokenization not recognized. "
                  "Accepted values are 'latin' and 'greek'.")
            return

        # If class instance is set to stem words, do so
        if self.stem_words:
            stemmer = Stemmer()
            str_a = stemmer.stem(str_a)
            str_b = stemmer.stem(str_b)

        # Tokenize input strings
        sents_a = sent_tokenizer.tokenize_sentences(str_a)
        sents_b = sent_tokenizer.tokenize_sentences(str_b)

        # Process sentences for comparison (taking into account sanitization settings)
        sents_a = self._process_sentences(sents_a)
        sents_b = self._process_sentences(sents_b)

        # Build matrix of edit distance ratios
        comparisons = self._calculate_ratios(sents_a, sents_b)

        return comparisons
Example #32
def bangla_tokenize(text):
    """Gets the spreadsheet's header column named 'bengali_version' and tokenizes each text based on that particular grammar.

    Parameters
    ----------
    text : str
        The texts retrieved from the spreadsheet

    Returns
    -------
    list
        a list of tokens
    """
    x = []
    tokenizer = TokenizeSentence('bengali')
    for line in text:
        bengali_text_tokenize = tokenizer.tokenize(line)
        x.insert(0, bengali_text_tokenize)
    return x[::-1]
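Example #33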
def tokenizing(hindi, english):
    matched_englist_list = []
    matched_hindi_list = []
    non_matched_englist_list = []
    non_matched_hindi_list = []
    global dat_frame_Matched
    global dat_frame_Not_Matched
    global final_df_matched
    global final_df_non_matched
    global count_sentence_positive
    global count_sentence_negative
    global total_section
    global count_section_average
    global total_section_mis_match_english
    global count_section_average_mis_match_english
    global total_section_mis_match_hindi
    global count_section_average_mis_match_hindi
    tokenizer = TokenizeSentence('hindi')
    for i in range(len(hindi)):
        l1 = tokenizer.tokenize(hindi[i])
        l2 = sent_tokenize(english[i])
        #print(len(l1)==len(l2))
        if len(l1) == len(l2):
            total_section = total_section + len(l2)
            count_section_average = count_section_average + 1
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            total_section_mis_match_english = total_section_mis_match_english + len(l2)
            count_section_average_mis_match_english = count_section_average_mis_match_english + 1
            total_section_mis_match_hindi = total_section_mis_match_hindi + len(l1)
            count_section_average_mis_match_hindi = count_section_average_mis_match_hindi + 1
            #print(l1,l2)
            count_sentence_negative = count_sentence_negative + len(l1)
            non_matched_englist_list.append(english[i])
            non_matched_hindi_list.append(hindi[i])
    final_df_matched['English'] = matched_englist_list
    final_df_matched['Hindi'] = matched_hindi_list
Example #34
class Tokenizer(object):
    def __init__(self):
        corpus_importer = CorpusImporter('greek')
        corpus_importer.import_corpus('greek_models_cltk')
        self.tokenizer = TokenizeSentence('greek')

    def calc_word_freq(self, data):
        word_dict = {}
        freq_dict = {}
        words = data.split()
        total_word = 0
        for word in words:
            if word in STOPS_LIST:
                continue
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1
            total_word += 1
        for key in word_dict.keys():
            freq_dict[key] = word_dict[key] / float(total_word)
        return freq_dict

    def tokenize_sentence(self, data):
        sentence_dict = {}
        sentences = self.tokenizer.tokenize_sentences(data)
        freq_dict = self.calc_word_freq(data)
        for i, sentence in enumerate(sentences):
            # accumulate the frequency score per sentence, not across the whole text
            word_frequency = 0
            words = sentence.split()
            for word in words:
                if word in STOPS_LIST:
                    continue
                word_frequency += freq_dict.get(word, 1e-20)
            len_words = len(words)
            calc = word_frequency / len_words
            sentence_dict[sentence] = ((calc, len_words), i)
        return sentence_dict
Example #35
import nltk
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
from collections import Counter
from IPython.display import Image
from cltk.stop.latin import STOPS_LIST

# See http://docs.cltk.org/en/latest/latin.html#sentence-tokenization

cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit."
cato_agri_praef_lowered = cato_agri_praef.lower()
# create a tokenizer instance of the TokenizeSentence Class
latin_sentence_tokenizer = TokenizeSentence('latin')

#tokenize the text into sentence tokens
cato_sentence_tokens = latin_sentence_tokenizer.tokenize_sentences(
    cato_agri_praef)

# tokenize the text (or specific sentences) into specific words
latin_word_tokenizer = WordTokenizer('latin')
cato_word_tokens = latin_word_tokenizer.tokenize(cato_agri_praef_lowered)
cato_word_tokens_WO_punt = [
    token for token in cato_word_tokens if token not in ['.', ',', ':', ';']
]

#print the tokens and the number of tokens
num_of_sentences = len(cato_sentence_tokens)
num_of_words = len(cato_word_tokens_WO_punt)
#print("There are " + str(num_of_sentences) + " sentences in the text")
#print("There are " + str(num_of_words) + " words in the text")
# for sentence in cato_sentence_tokens:
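Counter and STOPS_LIST are imported above but never used in the excerpt; a plausible continuation (a sketch, not part of the original source) that puts them to work:

# Sketch: count the most frequent non-stopword tokens with the imported Counter.
cato_content_words = [t for t in cato_word_tokens_WO_punt if t not in STOPS_LIST]
cato_word_counts = Counter(cato_content_words)
# print(cato_word_counts.most_common(10))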
Example #36
def sentence_tokenizer(text):
    text = text.replace(".", " | ")
    text = text.replace("\n", "").strip()
    print("Sentence Tokenizer triggered")
    hindi_text_sentence_tokenize = TokenizeSentence('hindi').tokenize(text)
    return hindi_text_sentence_tokenize
Example #37
#nltk.download()
import os
import pickle
import random
import time

import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from openpyxl.workbook import Workbook
from selenium import webdriver
from cltk.tokenize.sentence import TokenizeSentence

dir0 = 'F:/hindi_english_downloaded_split/epub/english_corpora'
os.chdir(dir0)
total_file = 0
list_dir0 = os.listdir()
file_low_list = []
English_j = []
Hindi_j = []
pos_sec = 0
tokenizer = TokenizeSentence('hindi')
total = 0
#trans_df=pd.read_excel('F:/hindi_english_downloaded_split/Hindi_all_sentences_trans.xlsx')
for io in range(len(list_dir0)):
    dir1 = dir0 + '/' + str(list_dir0[io])
    os.chdir(dir1)
    list_dir = os.listdir()
    for j in range(len(list_dir)):
        English_1 = []
        Hindi_1 = []
        dir2 = dir1 + '/' + str(list_dir[j])
        os.chdir(dir2)
        df_section = pd.read_excel('paragraph_section_to_section_combined.xlsx')
        df_sentence = pd.read_excel('train_sentence_level.xlsx')
        for i in range(len(df_section)):
            s = str(df_section["Hindi"][i])
Example #38
def tokenizing():
    matched_englist_list = []
    matched_hindi_list = []
    non_matched_englist_list = []
    non_matched_hindi_list = []
    global dat_frame_Matched
    global dat_frame_Not_Matched
    global final_df_matched
    global final_df_non_matched
    global count_sentence_positive
    global count_sentence_negative
    global total_section
    global count_section_average
    global total_section_mis_match_english
    global count_section_average_mis_match_english
    global total_section_mis_match_hindi
    global count_section_average_mis_match_hindi
    tokenizer = TokenizeSentence('hindi')
    for index, row in dat_frame_Matched.iterrows():
        #print(row['name'], row['age']):
        #print(row["Hindi"],row["English"])
        #row["Hindi"]=re.sub("\d+\.","",row["Hindi"])
        #row["English"]=re.sub("\d+\.","",row["English"])
        l1 = tokenizer.tokenize(row["Hindi"])
        l2 = sent_tokenize(row["English"])
        #print(len(l1)==len(l2))
        if len(l1) == len(l2):
            total_section = total_section + len(l2)
            count_section_average = count_section_average + 1
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            total_section_mis_match_english = total_section_mis_match_english + len(
                l2)
            count_section_average_mis_match_english = count_section_average_mis_match_english + 1
            total_section_mis_match_hindi = total_section_mis_match_hindi + len(
                l1)
            count_section_average_mis_match_hindi = count_section_average_mis_match_hindi + 1
            #print(l1,l2)
            count_sentence_negative = count_sentence_negative + len(l1)
            """print(len(l1),len(l2))
            for j in range(min(len(l1),len(l2))):
                matched_englist_list.append(l2[j])
                matched_hindi_list.append(l1[j])"""
            non_matched_englist_list.append(row["English"])
            non_matched_hindi_list.append(row["Hindi"])
    for index, row in dat_frame_Not_Matched.iterrows():
        hind = ' '.join(map(str, row["Hindi"]))
        englsh = ' '.join(map(str, row["English"]))
        l1 = re.split("।", hind)
        l2 = sent_tokenize(englsh)
        if len(l1) == len(l2):
            count_sentence_positive = count_sentence_positive + len(l2)
            matched_englist_list.extend(l2)
            matched_hindi_list.extend(l1)
        else:
            count_sentence_negative = count_sentence_negative + len(l2)
            """print(len(l1),len(l2))
            for j in range(min(len(l1),len(l2))):
                matched_englist_list.append(l2[j])
                matched_hindi_list.append(l1[j])"""
            non_matched_englist_list.append(englsh)
            non_matched_hindi_list.append(hind)
    final_df_matched['English'] = matched_englist_list
    final_df_matched['Hindi'] = matched_hindi_list
    final_df_non_matched['Hindi'] = non_matched_hindi_list
    final_df_non_matched['English'] = non_matched_englist_list
    translator = Translator()
    translated_english = []
    try:
        translations = translator.translate(matched_hindi_list, dest='en')
        for translation in translations:
            try:
                translated_english.append(translation.text)
            except:
                translated_english.append("None")
    except:
        for translation in matched_hindi_list:
            try:
                translated_english.append(translation.text)
            except:
                translated_english.append("None")
    final_df_matched['Translated'] = translated_english
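Example #39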
def scrap_doc():
	#scraping table
	regex = re.compile('[%s]' % re.escape(string.punctuation))
	
	tokenizer_latin = TokenizeSentence('latin')	
	directory="dataset/dbg"
	if not os.path.exists(directory):
			os.makedirs(directory)

	for i in range (1,9):
		url="http://sacred-texts.com/cla/jcsr/dbg"+str(i)+".htm"
		
		html = urllib.urlopen(url)
		soup = BeautifulSoup(html)

		
		#create text file
		target_e = open("dataset/dbg/dbg"+str(i)+"_eng.txt", 'w')
		target_l = open("dataset/dbg/dbg"+str(i)+"_lat.txt", 'w')

		#to remove <a></a>
		for tag in soup.find_all('a'):
			tag.replaceWith('')
		
		k=0
		for tr in soup.find_all('tr')[0:]:
			k=k+1
			tds = tr.find_all('td')
			col1=tds[0].text
			col2=tds[1].text
	
			col1_tok=tokenize.sent_tokenize(col1)
			#col2_tok=tokenize.sent_tokenize(col2)
			
			col2_tok=tokenizer_latin.tokenize_sentences(col2)

			no_sentences_eng=0
			#writing sentences to a file
			for l in range(len(col1_tok)):
				line=col1_tok[l]
				#line=regex.sub('', line).strip()
			
			
				if line!="":
					#line+='.'
					target_e.write((line.lower()).encode('utf-8'))
					target_e.write("\n")
					no_sentences_eng+=1
			
			no_sentences_lat=0
			for l in range(len(col2_tok)):
				line=col2_tok[l]
				#line=regex.sub('', line).strip()
			
			
				if line!="":
					#line+='.'
					target_l.write((line.lower()).encode('utf-8'))
					target_l.write("\n")
					no_sentences_lat+=1
			
			if no_sentences_eng!=no_sentences_lat:
				print ("wrong ",i,k," :",(no_sentences_eng)	,(no_sentences_lat)) 	
Example #40
def randomizer(authors, titles, texts, sample_size, 
			   test_dict, n_samples, smooth_test):

	""" |--- Function for randomly sampling from texts ---|
		::: Authors, Titles, Texts ::: """
	sampled_authors = []
	sampled_titles = []
	sampled_texts = []

	# Make train-test dict
	# Texts under the same author name are collected in one pool and then randomized
	pooled_dict = {author: [] for author in authors}
	for author, title, text in zip(authors, titles, texts):
		if author in pooled_dict:
			pooled_dict[author].append((title, text))

	# Instantiate cltk Tokenizer
	tokenizer = TokenizeSentence('latin')

	for author in pooled_dict:
		# Pool together texts by same author
		pooled_titles = [tup[0] for tup in pooled_dict[author]]
		pooled_texts = [tup[1] for tup in pooled_dict[author]]

		if author in test_dict and test_dict[author] in pooled_titles and smooth_test == False:
			print("::: test set «{} {}» is sampled in ordinary slices :::".format(author, "+".join(pooled_titles)))
			bulk = []
			for ord_text in pooled_texts:
				for word in ord_text.strip().split():
					word = word.lower()
					word = "".join([char for char in word if char not in punctuation])
					bulk.append(word)
				# Safety measure against empty strings in samples
				bulk = [word for word in bulk if word != ""]
				bulk = [bulk[i:i+sample_size] for i in range(0, len(bulk), sample_size)]
				for index, sample in enumerate(bulk):
					if len(sample) == sample_size: 
						sampled_authors.append(author)
						sampled_titles.append(test_dict[author] + "_{}".format(str(index + 1)))
						sampled_texts.append(" ".join(sample))

		else:
			# Make short random samples and add to sampled texts
			# Remove punctuation in the meantime
			print("::: training set «{} {}» is randomly sampled from corpus :::".format(author, "+".join(pooled_titles)))
			pooled_texts = " ".join(pooled_texts)
			pooled_texts = tokenizer.tokenize_sentences(pooled_texts)
			if len(pooled_texts) < 20:
				print("-----| ERROR: please check if input texts have punctuation, tokenization returned only {} sentence(s) |-----".format(len(pooled_texts)))
				break
			for _ in range(1, n_samples+1):
				random_sample = []
				while len(" ".join(random_sample).split()) <= sample_size:
					random_sample.append(random.choice(pooled_texts))
				for index, word in enumerate(random_sample):
					random_sample[index] = "".join([char for char in word if char not in punctuation])
				random_sample = " ".join(random_sample).split()[:sample_size]
				sampled_authors.append(author)
				sampled_titles.append('sample_{}'.format(_))
				sampled_texts.append(" ".join(random_sample))

	return sampled_authors, sampled_titles, sampled_texts
Example #41
import re

from cltk.tokenize.sentence import TokenizeSentence

'''Finds the part of the string which comes after the delimiter'''
def substring_after(s, delim):
    return s.partition(delim)[2]

'''Finds the part of the string which comes before the delimiter'''
def substring_before(s, delim):
    return s.partition(delim)[0]

'''Finds the part of the string which comes between the two delimiter'''
def substring_before_after(s, delim1, delim2):
    temp = s.partition(delim1)[2]
    return temp.partition(delim2)[0]

tokenizer = TokenizeSentence('bengali')
f = open("data.txt","r")

lines = f.readlines()
lines = [x.rstrip() for x in lines] 
i = 0
tokenized_list = []
pattern = r"^([0-9])*\)"
bengali_text_tokenize = []

for line in lines:
    if re.search(pattern,line):
        tokenized_list.append(bengali_text_tokenize)
        bengali_text_tokenize = []
    
    bengali_text_tokenize += tokenizer.tokenize(line)
Example #42
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words


            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
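gen_docs is a generator; a minimal consumption sketch (assuming the PHI5 plaintext corpus and the CLTK helpers imported by the surrounding module are installed):

# Hypothetical usage; requires the PHI5 corpus prepared for CLTK.
for i, sent in enumerate(gen_docs('phi5', lemmatize=False, rm_stops=True)):
    print(sent)
    if i >= 2:
        break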
Example #43
param = Namespace(
    raw_text='../data/HomerGesamt_cleaned.txt',
    stopwords='../data/stopwords.txt',
    window=15,  # quite high but useful for semantic analysis
    train_prop=0.7,
    val_prop=0.15,
    test_prop=0.15,
    output='../data/Homer_cbow_preprocessed.csv',
    MASK="<SENT_BOUND>")

# load file
homer = load_file(param.raw_text)

# Sentence tokenizer
greek_tokenizer = TokenizeSentence('greek')
homer_sentences = greek_tokenizer.tokenize(homer)


# clean tokens
def clean_delete_stopwords(sentences):
    '''

    :param sentences: a list of sentences
    :return: the same list without stopwords and with spacing after punctuation
    '''
    new_sentences = []
    for s in sentences:
        s = re.sub(r"([.,!?])", r" \1 ", s)
        tokens = delete_stopwords(stopwords_file=param.stopwords, text=s)
        tokens = ' '.join(w for w in tokens)