Example #1
 def test_sentence_tokenizer_marathi(self):
     """Test tokenizing marathi sentences."""
     text = "अर्जुन उवाच । एवं सतत युक्ता ये भक्तास्त्वां पर्युपासते । ये चाप्यक्षरमव्यक्तं तेषां के योगवित्तमाः ॥"
     target = ['अर्जुन', 'उवाच', '।', 'एवं', 'सतत', 'युक्ता', 'ये', 'भक्तास्त्वां', 'पर्युपासते', '।', 'ये', 'चाप्यक्षरमव्यक्तं', 'तेषां', 'के', 'योगवित्तमाः', '॥']
     tokenizer = TokenizeSentence('marathi')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
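The tests above and below exercise the legacy CLTK sentence tokenizer. As a standalone reference, roughly the following works with older cltk releases (the import path matches example #18 further down); treat it as a sketch rather than the canonical API:

from cltk.tokenize.sentence import TokenizeSentence

tokenizer = TokenizeSentence('marathi')
print(tokenizer.tokenize("अर्जुन उवाच । एवं सतत युक्ता ये भक्तास्त्वां पर्युपासते ।"))
# expected roughly: ['अर्जुन', 'उवाच', '।', 'एवं', 'सतत', 'युक्ता', 'ये', 'भक्तास्त्वां', 'पर्युपासते', '।']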
Example #2
 def test_sentence_tokenizer_sanskrit(self):
     """Test tokenizing sanskrit sentences."""
     text = "श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः। नानाविधानि दिव्यानि नानावर्णाकृतीनि च।।"
     target = ['श्री', 'भगवानुवाच', 'पश्य', 'मे', 'पार्थ', 'रूपाणि', 'शतशोऽथ', 'सहस्रशः', '।', 'नानाविधानि', 'दिव्यानि', 'नानावर्णाकृतीनि', 'च', '।', '।']
     tokenizer = TokenizeSentence('sanskrit')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #3
 def test_sentence_tokenizer_classical_hindi(self):
     """Test tokenizing classical_hindi sentences."""
     text = "जलर्  चिकित्सा से उन्हें कोई लाभ नहीं हुआ।"
     target = ['जलर्', 'चिकित्सा', 'से', 'उन्हें', 'कोई', 'लाभ', 'नहीं', 'हुआ', '।']
     tokenizer = TokenizeSentence('hindi')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #4
def cleaning_data(text):
    tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = tokenizer.tokenize(text)
    # clean() is defined elsewhere in the project (not shown here).
    cleaned = clean(bengali_text_tokenize)
    cleaned = ' '.join(cleaned)
    return cleaned
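The clean() helper called above is not shown. A minimal sketch of one plausible implementation, assuming it simply drops punctuation-only tokens (this is an assumption, not the project's actual helper):

from string import punctuation

def clean(tokens):
    # Hypothetical stand-in: keep tokens that contain at least one
    # non-punctuation character (Bengali danda signs included).
    drop = set(punctuation) | {'।', '॥'}
    return [tok for tok in tokens if any(ch not in drop for ch in tok)]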
Example #5
 def test_sentence_tokenizer_telugu(self):
     """Test tokenizing telugu sentences."""
     text = "తా. ఎక్కడెక్కడ బుట్టిన నదులును రత్నాకరుడను నాశతో సముద్రుని చేరువిధముగా నెన్నియిక్కట్టులకైన నోర్చి ప్రజలు దమంతట దామె ప్రియముం జూపుచు ధనికుని యింటికేతెంచుచుందురు."
     target = ['తా', '.', 'ఎక్కడెక్కడ', 'బుట్టిన', 'నదులును', 'రత్నాకరుడను', 'నాశతో', 'సముద్రుని', 'చేరువిధముగా', 'నెన్నియిక్కట్టులకైన', 'నోర్చి', 'ప్రజలు', 'దమంతట', 'దామె', 'ప్రియముం', 'జూపుచు', 'ధనికుని', 'యింటికేతెంచుచుందురు', '.']
     tokenizer = TokenizeSentence('telugu')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #6
 def test_sentence_tokenizer_bengali(self):
     """Test tokenizing bengali sentences."""
     text = "দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।"
     target = ['দুর্ব্বাসার', 'শাপে', 'রাজা', 'শকুন্তলাকে', 'একেবারে', 'ভুলে', 'বেশ', 'সুখে', 'আছেন', '।']
     tokenizer = TokenizeSentence('bengali')
     tokenized_sentences = tokenizer.tokenize(text)
     self.assertEqual(tokenized_sentences, target)
Example #7
def porter_tokenizer(text):
    """
    A Porter-stemmer/tokenizer hybrid that splits sentences into words (tokens)
    and applies the Porter stemming algorithm to each obtained token.
    Tokens consisting only of punctuation characters are removed, and only
    tokens longer than one letter are kept.

    Parameters
    ----------
    text : str
        A sentence to split into words.

    Returns
    -------
    bengali_text_tokenize : list
        The list of tokens returned by the sentence tokenizer.

    """
    tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = tokenizer.tokenize(text)
    return bengali_text_tokenize
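Note that the body above only tokenizes; the stemming and punctuation filtering promised in the docstring never happen. For English text, the described hybrid could be sketched with NLTK's PorterStemmer roughly as follows (the Porter algorithm is English-only, so this is illustrative rather than a Bengali equivalent):

from string import punctuation

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize  # needs the 'punkt' data package


def porter_tokenizer_sketch(text):
    """Tokenize, drop punctuation-only tokens, and Porter-stem the rest."""
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if any(ch not in punctuation for ch in t)]
    return [stemmer.stem(t) for t in tokens if len(t) > 1]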
Example #8
File: test_tokenize.py Project: cltk/cltk
    def test_sentence_tokenizer_sanskrit(self):
        """Test tokenizing Sanskrit sentences."""
        text = """श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः। यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।
न मे विदुः सुरगणाः प्रभवं न महर्षयः। अहमादिर्हि देवानां महर्षीणां च सर्वशः।।"""
        target = ['श्री भगवानुवाच भूय एव महाबाहो श्रृणु मे परमं वचः।','यत्तेऽहं प्रीयमाणाय वक्ष्यामि हितकाम्यया।।', 'न मे विदुः सुरगणाः प्रभवं न महर्षयः।', 'अहमादिर्हि देवानां महर्षीणां च सर्वशः।।']
        tokenizer = TokenizeSentence('sanskrit')
        tokenized_sentences = tokenizer.tokenize(text)
        self.assertEqual(tokenized_sentences, target)
Example #9
def preprocess_doc(
    sent,
    params={
        'remove_numbers': False,
        'remove_emoji': True,
        'remove_stop_words': True,
        'tokenize': True
    }):
    '''This function should implement a multi-lingual tokenizer.

    input: a document / sentence; params is a dict of control flags
    output: a token list for the entire document / sentence
    '''

    sent = emoji.demojize(sent)
    sent = re.sub(r"http\S+", '', sent)
    sent = re.sub(r"www.\S+", '', sent)

    if params['remove_numbers']:
        sent = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", sent)
    sent = re.sub(r"/-", " ", sent)
    sent = re.sub(r"#,\,", " ", sent)
    tokenizer = TokenizeSentence('hindi')
    sents = tokenizer.tokenize(sent)
    all_sents = []

    for s in sents:
        if params['remove_emoji']:
            s = re.sub(r":\S+:", "", s)
        else:
            s = re.sub(r"[:\*]", "", s)

        punc = set(punctuation) - set('.')

        newtext = []
        for k, g in groupby(s):
            if k in punc:
                newtext.append(k)
            else:
                newtext.extend(g)

        s = ''.join(newtext)

        # `puncts` is a module-level collection of punctuation characters.
        s = re.sub('[' + re.escape(''.join(puncts)) + ']', '', s)
        s = s.lower()
        if params['tokenize']:
            # `tok` is a word-level tokenizer defined at module scope.
            msg = tok.tokenize(s)
        else:
            msg = s

        if params['tokenize'] and params['remove_stop_words']:
            # `stop_for_this` is the module-level stopword list.
            msg_filtered = [word for word in msg if word not in stop_for_this]
        else:
            msg_filtered = msg
        if msg_filtered:
            all_sents.append(msg_filtered)

    return all_sents
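The function relies on several module-level objects (`tok`, `stop_for_this`, `puncts`, plus the `string.punctuation` and `itertools.groupby` imports), so it is not runnable on its own. The emoji- and URL-normalisation steps can still be exercised in isolation; a small sketch using the same emoji and re calls (the exact demojized alias depends on the emoji package version):

import re

import emoji

sample = "यह लिंक देखें http://example.com 😀"
sample = emoji.demojize(sample)           # 😀 -> ':grinning_face:' (version-dependent)
sample = re.sub(r"http\S+", '', sample)   # strip URLs
sample = re.sub(r":\S+:", '', sample)     # drop demojized emoji codes
print(sample.strip())                     # -> 'यह लिंक देखें'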
Example #10
 def test_classical_hindi_stops(self):
     """
     Test filtering classical hindi stopwords
     Sentence extracted from (https://github.com/cltk/hindi_text_ltrc/blob/master/miscellaneous/gandhi/main.txt)
     """
     sentence = " वह काबुली फिर वहां आकर खडा हो गया है  "
     tokenizer = TokenizeSentence('hindi')
     tokens = tokenizer.tokenize(sentence)
     no_stops = [word for word in tokens if word not in HINDI_STOPS]
     target_list = ['काबुली', 'फिर', 'वहां', 'आकर', 'खडा', 'गया']
     self.assertEqual(no_stops, target_list)
Example #11
def createCorpus(text, save=True):
    '''
    :param text: the raw text

    returns  + the corpus, a list of lists of tokenized sentences
             + the vocab (a dictionary with the frequency of the tokens
               scaled by the total number of words)

    '''
    with open('../../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        stopwords = src.read()

    stopwords = stopwords.split('\n')
    stopwords.extend([".", ",", "?", "!", "-", ":", ";", "·"])

    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')
    sentences = Stokenizer.tokenize(text)
    new_sentences = []
    vocab = dict()
    print('Building corpus and freqDictionary')
    for sent in tqdm(sentences, desc="Sentences"):
        new_sent = Wtokenizer.tokenize(sent)
        # Stopword deletion
        new_sent = [w for w in new_sent if w not in stopwords]
        new_sentences.append(new_sent)
        for w in new_sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    vocab_size = len(vocab)
    for k, v in vocab.items():
        # Subsampling, see paper by Goldberg & Levy
        frac = v / vocab_size
        p_w = (1 + np.sqrt(frac * 0.001)) * 0.001 / frac
        # update the value for the word
        vocab[k] = p_w
    if save:
        print('Saving the frequencies')
        with open('../../data/vocabularies/Homer_word_frequencies.json',
                  'w',
                  encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)
        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../../data/Homer_tokenized_corpus.npy', arr)
    return new_sentences, vocab
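The weight computed above resembles the word2vec subsampling expression, although here the fraction is taken over the vocabulary size and the threshold is multiplied inside the square root. For comparison only, the formulation usually attributed to Mikolov et al. (threshold commonly 1e-3 or 1e-5) can be written as a standalone sketch:

import math

def subsample_keep_prob(count, total_tokens, threshold=1e-3):
    # Reference word2vec keep-probability: frequent words get a smaller weight.
    frac = count / total_tokens
    return (math.sqrt(frac / threshold) + 1) * (threshold / frac)

print(round(subsample_keep_prob(50, 10_000), 3))  # 0.647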
Example #12
def bangla_tokenize(text):
    """Gets the spreadsheet's header column named 'bengali_version' and toeknize each text based on that particular grammar"

                        Parameters
                        ----------
                        text : str
                            The texts retrieved from the spreadsheet

                        Returns
                        -------
                        list
                            a list of tokens
                        """
    tokenizer = TokenizeSentence('bengali')
    tokens_per_line = []
    for line in text:
        tokens_per_line.append(tokenizer.tokenize(line))
    return tokens_per_line
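A quick usage sketch, assuming TokenizeSentence has been imported as in example #18 and using two made-up rows in place of the spreadsheet column:

from cltk.tokenize.sentence import TokenizeSentence  # needed by bangla_tokenize above

rows = [
    "দুর্ব্বাসার শাপে রাজা শকুন্তলাকে একেবারে ভুলে বেশ সুখে আছেন।",
    "আমি ভাত খাই।",
]
for tokens in bangla_tokenize(rows):
    print(tokens)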
Example #13
def createCorpus(text, save=True):
    '''
    :param text: the raw text

    returns  + the corpus, a list of lists of tokenized sentences
             + the vocab (a dictionary with the frequency of the tokens
               scaled by the total number of words)

    '''
    # load stopwords
    with open('../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        stopwords = src.read()

    # add punctuation signs
    stopwords = stopwords.split('\n')
    stopwords.extend(
        [".", ",", "?", "!", "-", ":", ";", "·", "”", "“", "«", "»"])

    # tokenize sentences and then words
    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')

    sentences = Stokenizer.tokenize(text)
    new_sentences = []
    vocab = dict()

    print('Building corpus and freqDictionary')
    total_tokens = 0
    check = 0
    # for each sentence
    for sent in tqdm(sentences, desc="Sentences"):
        # extract the words
        new_sent = Wtokenizer.tokenize(sent)
        check += len(new_sent)
        # Stopword deletion
        new_sent = [w for w in new_sent if w not in stopwords]
        new_sentences.append(new_sent)
        total_tokens += len(new_sent)
        # add each word to dictionary or update count
        for w in new_sent:
            # Increment tokens count
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1
    vocab_size = len(vocab)

    print("total tokens: ", total_tokens)
    print("total token (incl. stopwords)", check)
    print("vocab_size : ", vocab_size)
    # Subsampling
    threshold = 10e-05
    for k, v in vocab.items():
        # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/
        # Not really used for subsampling here but to generate the noise distribution
        frac = v / total_tokens
        p_w = (1 + math.sqrt(frac / threshold)) * (threshold / frac)
        vocab[k] = p_w

    if save:
        print('Saving the frequencies')
        with open(args.word_frequencies, 'w', encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)

        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../data/Homer_tokenized_accented.npy', arr)

        with open('../data/vocabs/Homer_wordList.csv', "w",
                  encoding="utf-8") as fp:
            for idx, word in tqdm(enumerate(vocab)):
                fp.write(str(idx) + "," + word + "\n")

    return new_sentences, vocab
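Because the corpus is saved as an object-dtype array, reloading it later needs allow_pickle=True. A short sketch of reading the artifacts back (the .npy and .csv paths are the ones used above):

import numpy as np

corpus = np.load('../data/Homer_tokenized_accented.npy', allow_pickle=True)
with open('../data/vocabs/Homer_wordList.csv', encoding='utf-8') as fp:
    word_list = [line.rstrip('\n').split(',', 1) for line in fp]
print(len(corpus), 'sentences;', len(word_list), 'vocabulary entries')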
def tokenizer(text):
    sentence_tokenizer = TokenizeSentence('bengali')
    bengali_text_tokenize = sentence_tokenizer.tokenize(text)
    return bengali_text_tokenize
def text_file_read(book_name, choice):
    global L1
    Txt_List = []
    with open(str(book_name), encoding="utf8") as File1_Open:
        for Line_in_File in File1_Open:
            Txt_List.append(Line_in_File)
    #print(len(Txt_List))
    s = (" ".join(Txt_List))
    if choice != 1:
        text = re.sub("[?!]", "ред", s)
        #print("[>>>",len(text))
        text = re.sub("[ ]+", " ", text)
        text = re.sub("[\n]+", "\n", text)
        text = re.sub("[\t]+", "", text)
        text = "".join([s for s in text.strip().splitlines(True) if s.strip()])
        with open(str(book_name), "w", encoding="utf-8") as f:
            f.write(text)
        tokenizer = TokenizeSentence('hindi')
        l1 = tokenizer.tokenize(text)
        l11 = []
        for po in range(len(l1)):
            kl = re.split("ред", str(l1[po]))
            l11.extend(kl)
        """for ind, i in enumerate(l1):
            text=i
            s2=""
            tlist = []
            tlist1 = []
            #print(text)
            try:
                driver.get("https://translate.google.com/#view=home&op=translate&sl=hi&tl=en&text={}".format(text))
                time.sleep(1)
                try:
                    content = driver.find_element_by_css_selector('.tlid-translation.translation')       # tlid-translation.translation    .gt-baf-table
                    txt = content.text.split('\n')
                    for t in txt:
                        if re.sub('[^A-Za-z ]', '', t):
                            tlist1.append(t)
                    s2=(" ".join(tlist1))
                    l2.append(s2)
                except Exception as e:
                    l2.append(s2)
                    print(e)
                if ind % 10 == 0:
                    print(ind)
                    Data_frame_Translation=pd.DataFrame()
                    Data_frame_Translation['Hindi']=l1[:ind+1]
                    Data_frame_Translation['Translated_Hindi']=l2
                    Data_frame_Translation.to_excel(str(book_name)+'.xlsx',index=False)
                    
            except:
                l2.append(s2)
                driver = webdriver.PhantomJS('C:/Users/User/Downloads/phantomjs-2.1.1-windows/bin/phantomjs')"""

        Data_frame_Translation = pd.DataFrame()
        Data_frame_Translation['Hindi'] = l1
        L1.extend(l1)
        Data_frame_Translation.to_excel(str(book_name) + '.xlsx', index=False)
    else:
        text = re.sub("[ ]+", " ", s)
        text = re.sub("[\n]+", "\n", text)
        text = re.sub("[\t]+", "", text)
        text = "".join([s for s in text.strip().splitlines(True) if s.strip()])
        with open(str(book_name), "w", encoding="utf-8") as f:
            f.write(text)
        l1 = sent_tokenize(text)
        Data_frame_Translation = pd.DataFrame()
        Data_frame_Translation['English'] = l1
        Data_frame_Translation.to_excel(str(book_name) + '.xlsx', index=False)
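The non-English branch above maps '?' and '!' to the Devanagari danda and then splits on it. That normalisation step in isolation, as a small self-contained sketch:

import re

line = "वह कहाँ गया? मुझे नहीं पता! वह घर गया।"
line = re.sub("[?!]", "।", line)                       # unify sentence enders
sentences = [s.strip() for s in line.split("।") if s.strip()]
print(sentences)  # ['वह कहाँ गया', 'मुझे नहीं पता', 'वह घर गया']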
from translate import translator
#nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize
from openpyxl.workbook import Workbook
e = []  # English sentences collected from each row
h = []  # Hindi sentences collected from each row

df=pd.read_excel("E:/hindi_english_downloaded_split/english_corpora/maths/6th/train_paragraph_level.xlsx")
print(df.columns)

tokenizer = TokenizeSentence('hindi')
for index, row in df.iterrows():
    s=str(row["Hindi"])
    s1=str(row["English"])
    s=re.sub("[!?.]","ред",s)
    l1=tokenizer.tokenize(s)
    l2=sent_tokenize(s1)
    if len(l1)>len(l2):
        for i in range(abs(len(l1)-len(l2))):
            l2.append("None")
    elif len(l1)<len(l2):
        for i in range(abs(len(l1)-len(l2))):
            l1.append("None")
    e.extend(l2)
    h.extend(l1)




df=pd.read_excel("E:/hindi_english_downloaded_split/english_corpora/maths/6th/missmatched_section.xlsx")
print(df.columns)
Example #17
    infile = open(infilename, "r")

    string = infile.readline()
    while string:
        string = string[:-1]
        filestring.append(string)
        string = infile.readline()

    infile.close()
    outfile = open(outfilename, "w")

    for shlok in filestring:

        #picking one shloka from the file
        t_shlok = tokenizer.tokenize(shlok)
        #initializing the flags
        count = 0  # to count the number of phonemes after which the split has to be done
        pos = 0  # to insert the -
        diff = 0  # to keep track of the overflow phonemes

        for i in range(len(t_shlok)):
            token = t_shlok[i]
            split = syl.orthographic_syllabify(token)
            l = len(split)

            # phonemes already covered
            prev = count

            #checking for purna-viram and numbers
            if l == 1 and check_token(token) == False:
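The snippet is truncated here, and `filestring`, `tokenizer`, `syl`, and `check_token` come from the surrounding file. The comments describe inserting a '-' once a fixed number of phonemes has been consumed; a toy, self-contained sketch of that idea (the character-level syllabifier and the group size of 8 are assumptions):

def naive_syllabify(token):
    # Hypothetical stand-in for syl.orthographic_syllabify(): one character
    # per "phoneme", which is wrong for Devanagari but keeps the sketch short.
    return list(token)

def hyphenate(tokens, group=8):
    out, count = [], 0
    for token in tokens:
        count += len(naive_syllabify(token))
        out.append(token)
        if count >= group:   # split point reached: insert the dash
            out.append('-')
            count = 0
    return ' '.join(out)

print(hyphenate(['rama', 'gacchati', 'vanam']))  # rama gacchati - vanam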
Example #18
 def candidate_words(self, stripped_input):
     from cltk.tokenize.sentence import TokenizeSentence
     tokenizer = TokenizeSentence('bengali')
     tokens = tokenizer.tokenize(stripped_input)
     return tokens
Example #19
                    t = Tokenizer()
                    t.read_from_file(PATH + file + '/' + inner_file + '/' +
                                     inner_inner_file)
                    raw_sentences = t.generate_sentences()
                    split_sentences = []
                    for sentence in raw_sentences:
                        for part in re.split(r'\?|!', sentence):
                            split_sentences.append(part)
                    filtered_sentences = []
                    for sentence in split_sentences:
                        if not re.match(r'^\s+$', sentence):
                            filtered_sentences.append(sentence)
                    words = []
                    for sentence in filtered_sentences:
                        sentence_tokenized = tokenizer.tokenize(sentence)
                        for token in sentence_tokenized:
                            words.append(token.strip('\n'))
                    length = [
                        len(tokenizer.tokenize(sentence))
                        for sentence in filtered_sentences
                    ]
                    one = statistics.mean(length)         # mean sentence length
                    two = statistics.stdev(length)        # its standard deviation
                    vocabulary = set(words)
                    three = len(vocabulary) / len(words)  # type/token ratio
                    feature = [one, two, three]
                    features.append(feature)

file = open("../pickle/features.pkl", 'wb')
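The three values collected per file are the mean sentence length, its standard deviation, and the type/token ratio. A self-contained toy illustration:

import statistics

sentences = [['this', 'is', 'a', 'sentence'], ['another', 'one'],
             ['a', 'third', 'sentence']]
lengths = [len(s) for s in sentences]
words = [w for s in sentences for w in s]
feature = [statistics.mean(lengths),           # mean sentence length
           statistics.stdev(lengths),          # its standard deviation
           len(set(words)) / len(words)]       # type/token ratio
print(feature)  # [3, 1.0, 0.777...]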