def get_translation(gen, sp, text: str, srclang: str) -> str:
    """Translate ``text`` sentence by sentence and return the joined output.

    The input is stripped, given a trailing sentence terminator if it lacks
    one, has Devanagari digits rewritten as ASCII digits, and is split into
    sentences (``sent_tokenize`` for ``"en"``,
    ``sentence_tokenize.sentence_split`` for ``"hi"``).  Each sentence is
    lower-cased, encoded with ``sp.encode_as_pieces``, handed to
    ``gen.generate`` as a space-joined piece string, and the
    whitespace-split result is decoded with ``sp.decode_pieces``.

    Args:
        gen: Object exposing ``generate(str)`` whose result supports
            ``.split()``.
        sp: Object exposing ``encode_as_pieces`` and ``decode_pieces``
            (presumably a SentencePiece processor -- confirm against caller).
        text: Source text; may contain several sentences.
        srclang: Source language code, ``"en"`` or ``"hi"``.

    Returns:
        The per-sentence outputs joined by single spaces, with every
        ``" | "``, ``" . "``, ``" ? "`` and ``" ! "`` collapsed to the bare
        mark.  An empty string is returned when ``text`` is empty or
        whitespace-only.

    Raises:
        ValueError: If ``srclang`` is neither ``"en"`` nor ``"hi"``.
    """
    if srclang not in ("en", "hi"):
        raise ValueError(
            f"unsupported srclang {srclang!r}; expected 'en' or 'hi'")

    source = text.strip()
    if not source:
        # Nothing to translate; the terminator checks below index source[-1].
        return ""

    # Make sure the text ends with a terminator.
    if srclang == "hi":
        if source[-1] != "|":
            source += " |"
    elif not source.endswith((".", "?", "!", "।")):
        source += "."

    # Devanagari digits -> ASCII digits, in a single pass.
    source = source.translate(str.maketrans("०१२३४५६७८९", "0123456789"))

    if srclang == "en":
        sentences = sent_tokenize(source)
    else:
        sentences = sentence_tokenize.sentence_split(source, lang='hi')

    # Fragments that are empty or consist of punctuation only are skipped.
    # "\\." is the two-character string backslash + dot.
    skipped = {"", "।", "\\.", ".", ",", "|", "?", "!", ";"}

    translated = []
    for sentence in sentences:
        sentence = str(sentence).strip().lower()
        if sentence in skipped:
            continue
        if srclang == "hi":
            # Hindi sentences are fed to the model ending in " |", unless
            # they already end in "|", "?" or "!".
            if sentence[-1] == ".":
                sentence = sentence[:-1] + " |"
            elif sentence[-1] not in ("|", "?", "!"):
                sentence += " |"
        pieces = " ".join(sp.encode_as_pieces(sentence))
        output_pieces = gen.generate(pieces.strip()).split()
        translated.append(sp.decode_pieces(output_pieces))

    # Every output is prefixed with a space so that a leading "| " (etc.) of
    # any sentence still matches the padded patterns below.
    joined = "".join(" " + output for output in translated)
    for padded, bare in ((" | ", "|"), (" . ", "."), (" ? ", "?"),
                         (" ! ", "!")):
        joined = joined.replace(padded, bare)
    return joined.strip()
def clean_captions(captions: List) -> dict:
    """Drop duplicate and English-containing sentences from each caption.

    Each item's ``'caption'`` is split with
    ``sentence_tokenize.sentence_split(..., lang='hi')``.  Repeated
    sentences are collapsed to their first occurrence, and sentences for
    which ``has_english_char`` returns a truthy value are discarded.  The
    surviving sentences are re-joined with single spaces in their original
    order.

    Args:
        captions: Iterable of mappings providing ``'image_id'`` and
            ``'caption'`` keys.

    Returns:
        ``{'annotations': [{'image_id': ..., 'caption': ...}, ...]}`` with
        one entry per input item, in input order.
    """
    annotations = []
    for img in tqdm(captions):
        sentences = sentence_tokenize.sentence_split(img['caption'], lang='hi')
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would scramble the sentence order of the rebuilt caption.
        kept = [
            sent for sent in dict.fromkeys(sentences)
            if not has_english_char(sent)
        ]
        annotations.append({
            'image_id': img['image_id'],
            'caption': ' '.join(kept),
        })
    return {'annotations': annotations}
def s_g_20Hindi(text):
    """Count the sentences of Hindi ``text`` whose syllable estimate is over 20.

    ``text`` is split with ``sentence_tokenize.sentence_split`` (``lang='hi'``)
    and every sentence for which ``syllables.estimate`` returns more than 20
    contributes one to the returned total.
    """
    return sum(
        1
        for sentence in sentence_tokenize.sentence_split(text, lang='hi')
        if syllables.estimate(sentence) > 20
    )
def split_indic(line: str) -> tp.Iterable[str]:
    """Yield the sentences of ``line`` after Indic normalization.

    ``line`` is passed through ``indic_normalizer.normalize`` and the result
    is split with ``indic_sent_tok.sentence_split``.  The language comes from
    the free variable ``lang``, which is not defined inside this function.
    """
    normalized = indic_normalizer.normalize(line)
    yield from indic_sent_tok.sentence_split(normalized, lang=lang)
# Destination files for the one-sentence-per-line corpora.
ENG_TXT = "/home/eng_txt.txt"
HIN_TXT = "/home/hin_txt.txt"

# NOTE(review): HIN_CSV and ENG_CSV are not defined in this part of the
# script; they are expected to be set before this point.
hi_df = pd.read_csv(HIN_CSV)
lst = hi_df['data'].to_list()

# Iterate through the items in the list and collect the paragraphs.
# Each 'data' cell is parsed with literal_eval and appended to one flat list
# (presumably every cell holds the repr of a list of paragraph strings).
fulllst = []
for item in lst:
    sen = literal_eval(item)
    fulllst.extend(sen)  # in place: avoids re-copying the list for every row

# Tokenize the scraped Hindi paragraphs with sentence_tokenize and write one
# sentence per line.
with open(HIN_TXT, "w", encoding="utf-16") as fobj:
    for x in fulllst:
        sentences = sentence_tokenize.sentence_split(x, lang='hi')
        for t in sentences:
            fobj.write(t + "\n")

# Iterate through the items in the list and collect the paragraphs.
en_df = pd.read_csv(ENG_CSV)
lst = en_df['data'].to_list()

fulllst = []
for item in lst:
    sen = literal_eval(item)
    fulllst.extend(sen)

# Tokenize the scraped English paragraphs using NLTK and write one sentence
# per line.
with open(ENG_TXT, "w", encoding="utf-16") as fobj:
    for x in fulllst:
        sentences = sent_tokenize(x)
        # Same output format as the Hindi file; without this loop ENG_TXT
        # would be created but left empty.
        for t in sentences:
            fobj.write(t + "\n")
def sentenceCountHindi(text):
    """Return how many sentences the Hindi ``text`` contains.

    The count is the length of the list produced by
    ``sentence_tokenize.sentence_split`` with ``lang='hi'``; ``text`` is
    passed to the splitter as-is, without removing punctuation first.
    """
    return len(sentence_tokenize.sentence_split(text, lang='hi'))
def split(self, line):
    """Split ``line`` into sentences using this instance's language.

    Delegates to ``sentence_tokenize.sentence_split`` with ``self.language``
    as the language argument and returns its result unchanged.
    """
    return sentence_tokenize.sentence_split(line, self.language)