コード例 #1
0
def get_translation(gen, sp, text, srclang):
    """Translate ``text`` one sentence at a time and return the joined result.

    The input is stripped, given a terminal punctuation mark if it lacks one,
    has its Devanagari digits mapped to ASCII digits, and is split into
    sentences. Each sentence is lower-cased, encoded into pieces with ``sp``,
    passed to ``gen.generate`` and decoded back with ``sp``.

    Args:
        gen: Object whose ``generate(str)`` returns a whitespace-separated
            string of output pieces.
        sp: Object providing ``encode_as_pieces(str)`` and
            ``decode_pieces(list)`` (presumably a SentencePiece processor --
            confirm against the caller).
        text: Source text; may contain several sentences.
        srclang: ``"en"`` or ``"hi"``.

    Returns:
        The translated sentences joined by single spaces, with the padding
        around ``|``, ``.``, ``?`` and ``!`` removed. Empty or
        whitespace-only input yields ``""``.

    Raises:
        ValueError: If ``srclang`` is neither ``"en"`` nor ``"hi"``.
    """
    source = text.strip()
    if not source:
        # Guard: indexing the last character of an empty string would raise.
        return ""
    if srclang not in ("en", "hi"):
        # Without this check no sentence list would ever be built for the
        # loop below, which surfaced as an opaque UnboundLocalError.
        raise ValueError(
            "unsupported srclang %r; expected 'en' or 'hi'" % (srclang,))

    # Make sure the text ends with a terminator.
    # NOTE(review): the Hindi branch uses the ASCII pipe "|" rather than the
    # danda "।" as terminator -- kept as-is, confirm this matches the model.
    if srclang == "hi":
        if not source.endswith("|"):
            source += " |"
    elif not source.endswith((".", "?", "!", "।")):
        source += "."

    # Map Devanagari digits to their ASCII counterparts in one pass.
    source = source.translate(str.maketrans("०१२३४५६७८९", "0123456789"))

    if srclang == "en":
        sentences = sent_tokenize(source)
    else:
        sentences = sentence_tokenize.sentence_split(source, lang='hi')

    # Fragments that consist solely of punctuation carry nothing to translate.
    skippable = {"", "।", "\\.", ".", ",", "|", "?", "!", ";"}

    translated_parts = []
    for sentence in sentences:
        sentence = str(sentence).strip().lower()
        if sentence in skippable:
            continue
        if srclang == "hi":
            if sentence.endswith("."):
                # Swap a trailing full stop for the pipe terminator.
                sentence = sentence[:-1] + " |"
            elif not sentence.endswith(("|", "?", "!")):
                sentence += " |"
        tokenized = " ".join(sp.encode_as_pieces(sentence))
        output_pieces = gen.generate(tokenized.strip()).split()
        # Keep the leading space so the " | " style replacements below also
        # match at the start of the first sentence.
        translated_parts.append(" " + sp.decode_pieces(output_pieces))

    joined = "".join(translated_parts)
    for padded, tight in ((" | ", "|"), (" . ", "."), (" ? ", "?"),
                          (" ! ", "!")):
        joined = joined.replace(padded, tight)
    return joined.strip()
コード例 #2
0
def clean_captions (captions : List) -> dict:
    """Drop duplicate and English-containing sentences from Hindi captions.

    Args:
        captions: Iterable of mappings, each with an ``'image_id'`` and a
            ``'caption'`` entry.

    Returns:
        A dict ``{'annotations': [...]}`` holding one
        ``{'image_id', 'caption'}`` entry per input item. Each caption is
        rebuilt from its unique sentences, in their original order, skipping
        every sentence for which ``has_english_char`` is truthy.
    """
    cleaned = {'annotations': []}
    for img in tqdm(captions):
        sentences = sentence_tokenize.sentence_split(img['caption'], lang='hi')

        # dict.fromkeys removes duplicates while keeping first-occurrence
        # order; a set would scramble the sentences, and the scrambling would
        # differ from run to run because of string hash randomization.
        kept = [
            sent for sent in dict.fromkeys(sentences)
            if not has_english_char(sent)
        ]

        cleaned['annotations'].append({
            'image_id': img['image_id'],
            'caption': ' '.join(kept),
        })
    return cleaned
コード例 #3
0
def s_g_20Hindi(text):
    """Count the Hindi sentences of ``text`` estimated at more than 20 syllables.

    The text is split with ``sentence_tokenize.sentence_split`` and each
    sentence is scored with ``syllables.estimate``.
    """
    return sum(
        1
        for sentence in sentence_tokenize.sentence_split(text, lang='hi')
        if syllables.estimate(sentence) > 20
    )
コード例 #4
0
 def split_indic(line: str) -> tp.Iterable[str]:
     """Normalize ``line`` and lazily yield its sentences.

     The line is first passed through ``indic_normalizer.normalize`` and then
     split by ``indic_sent_tok.sentence_split`` for the language ``lang``
     taken from the enclosing scope.
     """
     normalized = indic_normalizer.normalize(line)
     yield from indic_sent_tok.sentence_split(normalized, lang=lang)
コード例 #5
0
# Output files: one sentence per line.
ENG_TXT = "/home/eng_txt.txt"
HIN_TXT = "/home/hin_txt.txt"

# NOTE(review): HIN_CSV and ENG_CSV are not defined in this section; they are
# expected to be set before this point.
hi_df = pd.read_csv(HIN_CSV)
lst = hi_df['data'].to_list()

# Every 'data' cell is a Python literal (parsed with literal_eval) holding a
# sequence of paragraphs; flatten all cells into one list of paragraphs.
fulllst = []
for item in lst:
    fulllst.extend(literal_eval(item))

# Split the Hindi paragraphs into sentences and write one sentence per line.
with open(HIN_TXT, "w", encoding="utf-16") as fobj:
    for x in fulllst:
        sentences = sentence_tokenize.sentence_split(x, lang='hi')
        fobj.writelines(t + "\n" for t in sentences)

# Same flattening step for the English data.
en_df = pd.read_csv(ENG_CSV)
lst = en_df['data'].to_list()
fulllst = []
for item in lst:
    fulllst.extend(literal_eval(item))

# Split the English paragraphs into sentences and write one sentence per
# line, mirroring the Hindi branch (the sentences were previously computed
# but never written, leaving ENG_TXT empty).
with open(ENG_TXT, "w", encoding="utf-16") as fobj:
    for x in fulllst:
        sentences = sent_tokenize(x)
        fobj.writelines(t + "\n" for t in sentences)
コード例 #6
0
def sentenceCountHindi(text):
    """Return how many sentences the Hindi splitter finds in ``text``.

    The text is passed to the splitter as given; punctuation is not removed
    beforehand.
    """
    return len(sentence_tokenize.sentence_split(text, lang='hi'))
コード例 #7
0
 def split(self, line):
     """Split ``line`` into sentences for this instance's ``language``."""
     return sentence_tokenize.sentence_split(line, self.language)