def _single_convert(self, input_file_object): if input_file_object: input_file_path = input_file_object.get_input_file_path() output_file_name = rename_filename_with_extension( os.path.basename(input_file_path), self.final_format) intermediate_filename = str(time.time()).replace('.', '') + '.html' output_file_path = os.path.join(self.tmp_dir, output_file_name) intermediate_path = os.path.join( self.tmp_dir, intermediate_filename) with codecs.open(input_file_path, "r", "utf-8") as f: cleaned_content = remove_tags(f.read()) with open(intermediate_path, 'w') as w: w.write(cleaned_content) converter = CONVERTER_LOCATION.format( input_file_path=intermediate_path, output_file_path=output_file_path) self.execute(converter) if os.path.isfile(output_file_path): return output_file_path else: self.handle_failed_conversion(input_file_object) log.error('Conversion failed from HTML => PDF') return None
def _single_convert(self, input_file_object): if input_file_object: input_file_path = input_file_object.get_input_file_path() log.info('Converting {} to TXT'.format(input_file_path)) html_to_text = html2text.HTML2Text() html_to_text.ignore_links = html_to_text.ignore_images = True try: input_stream = input_file_object.get_input_stream() except UnicodeDecodeError, e: self.handle_failed_conversion(input_file_object) log.error( 'Conversion failed from HTML => TXT for {} {}'.format( input_file_path, e) ) return None soup = BeautifulSoup(input_stream) invalid_attrs = 'href src width height target \ style color face size script'.split() for attr in invalid_attrs: [dom_el.extract() for dom_el in soup(attr)] input_stream = unicode(soup) try: output_stream = html_to_text.handle(input_stream) except e: self.handle_failed_conversion(input_file_object) log.error( 'Conversion failed from HTML => TXT for {} {}'.format( input_file_path, e) ) return None output_file_name = rename_filename_with_extension( os.path.basename(input_file_object.get_input_file_path()), self.final_format) output_file_path = os.path.join(self.tmp_dir, output_file_name) write_stream(output_file_path, output_stream) if os.path.isfile(output_file_path): return output_file_path else: self.handle_failed_conversion(input_file_object)
def _single_convert(self, input_file_object): if input_file_object: input_file = input_file_object.get_input_file_path() output_file_name = rename_filename_with_extension( os.path.basename(input_file), self.final_format) output_file_path = os.path.join(self.tmp_dir, output_file_name) converter = CONVERTER_LOCATION.format(input_file_path=input_file, output_file_dir=self.tmp_dir) self.execute(converter) if os.path.isfile(output_file_path): return output_file_path else: self.handle_failed_conversion(input_file_object) log.error('Conversion failed from PDF => HTML') return None
def _single_convert(self, input_file_object): if input_file_object: input_file_path = input_file_object.get_input_file_path() log.info('Converting {} to TXT'.format(input_file_path)) html_to_text = html2text.HTML2Text() html_to_text.ignore_links = html_to_text.ignore_images = True try: input_stream = input_file_object.get_input_stream() except UnicodeDecodeError, e: self.handle_failed_conversion(input_file_object) log.error( 'Conversion failed from HTML => TXT for {} {}'.format( input_file_path, e)) return None soup = BeautifulSoup(input_stream) invalid_attrs = 'href src width height target \ style color face size script'.split() for attr in invalid_attrs: [dom_el.extract() for dom_el in soup(attr)] input_stream = unicode(soup) try: output_stream = html_to_text.handle(input_stream) except e: self.handle_failed_conversion(input_file_object) log.error( 'Conversion failed from HTML => TXT for {} {}'.format( input_file_path, e)) return None output_file_name = rename_filename_with_extension( os.path.basename(input_file_object.get_input_file_path()), self.final_format) output_file_path = os.path.join(self.tmp_dir, output_file_name) write_stream(output_file_path, output_stream) if os.path.isfile(output_file_path): return output_file_path else: self.handle_failed_conversion(input_file_object)
def _single_convert(self, input_file_object): if input_file_object: input_file_path = input_file_object.get_input_file_path() output_file_name = rename_filename_with_extension( os.path.basename(input_file_path), 'pdf') output_file_path = self.tmp_dir converter = CONVERTER_LOCATION.format( output_file_path=output_file_path, input_file_path=input_file_path) self.execute(converter) output_file = os.path.join(output_file_path, output_file_name) if os.path.isfile(output_file): return output_file else: self.handle_failed_conversion(input_file_object) log.error('Conversion failed from DOC => PDF') return None
def _single_convert(self, input_file_object): if input_file_object: input_file_path = input_file_object.get_input_file_path() output_file_name = rename_filename_with_extension( os.path.basename(input_file_path), 'pdf') output_file_path = TMP_DIR converter = CONVERTER_LOCATION.format( output_file_path=output_file_path, input_file_path=input_file_path) self.execute(converter) output_file = os.path.join(output_file_path, output_file_name) if os.path.isfile(output_file): return output_file else: self.handle_failed_conversion(input_file_path) log.error('Conversion failed from RTF => PDF') return None
def _single_convert(self, input_file_object): if input_file_object: input_stream = input_file_object.get_input_stream() try: output_stream = markdown2.markdown(input_stream) except: print "Conversion Unsuccessfull for txt_html" return None output_file_name = rename_filename_with_extension( os.path.basename(input_file_object.get_input_file_path()), self.final_format) output_file_path = os.path.join(self.tmp_dir, output_file_name) write_stream(output_file_path, output_stream) if os.path.isfile(output_file_path): return output_file_path else: self.handle_failed_conversion(input_file_object) log.error('Conversion failed from TXT => HTML') return None
def get_remote_location(self):
    """Return the remote dump path for this document's converted file.

    The path is ``app.config['REMOTE_DUMP_FOLDER']`` joined with
    ``self.doc_id`` and the name that ``rename_filename_with_extension``
    produces from the file instance's filename and ``self.output_format``.
    """
    remote_name = rename_filename_with_extension(
        self.file_instance.filename, self.output_format)
    dump_root = app.config['REMOTE_DUMP_FOLDER']
    return os.path.join(dump_root, self.doc_id, remote_name)