def tokenisation_core(self, paragraph_data, text_locale):
    """Split the given paragraphs into sentences using the locale-specific Anuvaad tokenizer (en, hi, kn)."""
    try:
        tokenised_text = []
        if text_locale == 'en':
            for paragraph in paragraph_data:
                tokenised_sentence_data = AnuvaadEngTokenizer().tokenize(paragraph)
                tokenised_text.extend(tokenised_sentence_data)
        elif text_locale == 'hi':
            for paragraph in paragraph_data:
                tokenised_sentence_data = AnuvaadHinTokenizer().tokenize(paragraph)
                tokenised_text.extend(tokenised_sentence_data)
        elif text_locale == 'kn':
            for paragraph in paragraph_data:
                tokenised_sentence_data = AnuvaadKanTokenizer().tokenize(paragraph)
                tokenised_text.extend(tokenised_sentence_data)
        return tokenised_text
    except Exception:
        log_exception(
            "tokenisation_core : Error occurred during tokenising the paragraphs",
            self.input_json_data, None)
        raise ServiceError(
            400, "Tokenisation failed. Something went wrong during tokenisation.")
def tokenisation_core(self, paragraph_data, text_locale):
    """Split each non-empty paragraph into sentences using the locale-specific Anuvaad tokenizer (en, hi/mr, kn, ta, ml)."""
    tokenised_text = []
    for paragraph in paragraph_data:
        if paragraph is not None:
            try:
                paragraph = self.remove_extra_spaces(paragraph)
                if text_locale == 'en':
                    tokenised_sentence_data = AnuvaadEngTokenizer().tokenize(paragraph)
                    tokenised_text.extend(tokenised_sentence_data)
                elif text_locale in ('hi', 'mr'):
                    tokenised_sentence_data = AnuvaadHindiTokenizer().tokenize(paragraph)
                    tokenised_text.extend(tokenised_sentence_data)
                elif text_locale == 'kn':
                    tokenised_sentence_data = AnuvaadKannadaTokenizer().tokenize(paragraph)
                    tokenised_text.extend(tokenised_sentence_data)
                elif text_locale == 'ta':
                    tokenised_sentence_data = AnuvaadTamilTokenizer().tokenize(paragraph)
                    tokenised_text.extend(tokenised_sentence_data)
                elif text_locale == 'ml':
                    tokenised_sentence_data = AnuvaadMalayalamTokenizer().tokenize(paragraph)
                    tokenised_text.extend(tokenised_sentence_data)
            except Exception:
                log_exception(
                    "Received error in this text : %s" % (paragraph),
                    self.input_json_data, None)
                raise ServiceError(
                    400, "Tokenisation failed. Something went wrong during tokenisation.")
    return tokenised_text
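# Usage sketch (hypothetical, for illustration only): assuming `tokeniser` is an
# instance of the enclosing service class, English paragraphs are split into
# sentences roughly like this:
#
#     paragraphs = ["Hello world. This is a test.", "Another paragraph."]
#     sentences = tokeniser.tokenisation_core(paragraphs, 'en')
#     # -> ["Hello world.", "This is a test.", "Another paragraph."]
#
# The exact sentence boundaries depend on AnuvaadEngTokenizer, so the output
# shown above is indicative, not guaranteed.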
def tokenisation_response(self, input_file_data, in_locale, index):
    """Tokenise the input file contents, write the sentences to a .txt output file and return its name."""
    try:
        output_filepath, output_filename = file_ops.output_path(
            index, self.DOWNLOAD_FOLDER, '.txt')
        tokenised_data = self.tokenisation_core(input_file_data, in_locale)
        self.writing_tokenised_sentence_in_file(tokenised_data, output_filepath)
        return output_filename
    except Exception:
        log_exception(
            "tokenisation_response : Error occurred during output file creation",
            None, None)
        raise ServiceError(
            400, "Tokenisation failed. Something went wrong during output file creation.")
def pdf2html(self, input_pdf_file, jobid):
    """Convert the input PDF into HTML and PNG files and return both output paths."""
    try:
        output_htmlfiles_path, output_pngfiles_path = pdf_ops.pdf_to_html(
            self.DOWNLOAD_FOLDER, input_pdf_file)
        log_info(
            "pdf2html",
            "successfully received output filepath for HTML and PNG files",
            jobid)
        return output_htmlfiles_path, output_pngfiles_path
    except Exception:
        log_exception("pdf2html", "Error occurred during pdf to html conversion",
                      jobid, None)
        raise ServiceError(
            400, "pdf2html failed. Something went wrong during conversion.")
def adding_tokenised_text_blockmerger(self, input_json_data_pagewise, in_locale, page_id):
    """Attach per-sentence tokenisation objects to every text block of a block-merger page."""
    try:
        blocks = input_json_data_pagewise['text_blocks']
        for block_id, item in enumerate(blocks):
            text_data = item['text']
            tokenised_text = self.tokenisation_core([text_data], in_locale)
            item['tokenized_sentences'] = [
                self.making_object_for_tokenised_text(
                    text, in_locale, i, block_id, page_id)
                for i, text in enumerate(tokenised_text)
            ]
        return input_json_data_pagewise
    except Exception:
        log_error(
            "Keys in block merger response changed or tokenisation went wrong.",
            self.input_json_data, None)
        raise ServiceError(
            400,
            "Tokenisation failed. Keys in block merger response changed or tokenisation went wrong.")
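# Input/output sketch (hypothetical; the page schema is assumed from the keys
# used above, not confirmed elsewhere in this file):
#
#     page = {"text_blocks": [{"text": "First sentence. Second sentence."}]}
#     page = tokeniser.adding_tokenised_text_blockmerger(page, 'en', page_id=0)
#     # Each block now also carries a "tokenized_sentences" list, one object per
#     # sentence, built by making_object_for_tokenised_text(text, locale, i,
#     # block_id, page_id).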