def process_gdocs_resource(self, save_dir, gdocs_resource_id, gdocs_access_token=None):
    """Download a Google Docs document and store it as CNXML in ``save_dir``.

    Args:
        save_dir: directory handed to ``save_cnxml`` for the converted output.
        gdocs_resource_id: identifier passed to the client's ``GetDoc``.
        gdocs_access_token: optional token string; when truthy it is wrapped
            in a ``gdata.gauth.AuthSubToken``, otherwise no token is used.

    Returns:
        A ``(title, filename)`` tuple: the document entry's title text and
        the fixed label ``"Google Document"``.
    """
    # Log in to Google Docs and obtain a client object.
    client = getAuthorizedGoogleDocsClient()

    # Only build an AuthSub token when a token string was supplied.
    token = None
    if gdocs_access_token:
        token = gdata.gauth.AuthSubToken(gdocs_access_token)

    # Look up the document entry, then pull its content from the entry's URL.
    entry = client.GetDoc(gdocs_resource_id, None, token)
    raw_html = client.get_file_content(entry.content.src, token)

    # Transform to CNXML (downloading images), clean, save, then validate.
    converted, images = gdocs_to_cnxml(raw_html, bDownloadImages=True)
    cleaned = clean_cnxml(converted)
    save_cnxml(save_dir, cleaned, images.items())
    validate_cnxml(cleaned)

    # An older comment warns that returning this filename might kill the
    # ability to do multiple tabs in parallel, unless it gets offloaded
    # onto the form again.
    return (entry.title.text, "Google Document")
def process(self):
    """Convert the uploaded LaTeX archive to CNXML and go to the next step.

    Reads ``self.original_filename``, transforms it with ``latex_to_cnxml``,
    then cleans, saves (into ``self.save_dir``) and validates the CNXML.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        # Read through a context manager so the file handle is always
        # closed, even when reading fails.
        with open(self.original_filename) as f:
            latex_archive = f.read()

        # LaTeX 2 CNXML transformation
        cnxml, objects = latex_to_cnxml(latex_archive, self.original_filename)

        cnxml = clean_cnxml(cnxml)
        save_cnxml(self.save_dir, cnxml, objects.items())
        validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        # Top-level boundary: record the traceback and show the error page.
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        # Drop the stale title so it does not leak into the next upload.
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
def process(self):
    """Convert the uploaded office document to CNXML and go to the next step.

    Uploads whose extension is not ``.odt`` are first passed to
    ``self._convert_to_odt``; the ODT file is then transformed, and the
    resulting CNXML is cleaned, saved to ``self.save_dir`` and validated.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        base_name, suffix = os.path.splitext(self.original_filename)
        odt_path = str(base_name) + '.odt'
        # Anything that is not already ODT goes through the converter first.
        if suffix != '.odt':
            self._convert_to_odt(base_name)

        # Convert and save all the resulting files. ``transform`` also
        # returns an errors value, which is not used here.
        tree, files, errors = transform(odt_path)
        cleaned = clean_cnxml(etree.tostring(tree))
        save_cnxml(self.save_dir, cleaned, files.items())

        # Now validate with jing.
        validate_cnxml(cleaned)
    except ConversionError as err:
        return render_conversionerror(self.request, err.msg)
    except Exception:
        trace = traceback.format_exc()
        self.write_traceback_to_zipfile(trace)
        session = self.request.session
        if 'title' in session:
            del session['title']
        return render_to_response(
            'templates/error.pt', {'traceback': trace}, request=self.request)

    self.request.session.flash(self.message)
    next_url = self.request.route_url(self.nextStep())
    return HTTPFound(location=next_url)
def process(self, form):
    """Convert the uploaded office document to CNXML and go to the next step.

    Uploads whose extension is not ``.odt`` are first passed to
    ``self._convert_to_odt``; the ODT file is then transformed, and the
    resulting CNXML is cleaned, saved to ``self.save_dir`` and validated.

    Args:
        form: the submitted form; only forwarded to
            ``self.write_traceback_to_zipfile`` when an unexpected error
            is recorded.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        base_name, suffix = os.path.splitext(self.original_filename)
        odt_path = str(base_name) + '.odt'
        # Anything that is not already ODT goes through the converter first.
        if suffix != '.odt':
            self._convert_to_odt(base_name)

        # Convert and save all the resulting files. ``transform`` also
        # returns an errors value, which is not used here.
        tree, files, errors = transform(odt_path)
        cleaned = clean_cnxml(etree.tostring(tree))
        save_cnxml(self.save_dir, cleaned, files.items())

        # Now validate with jing.
        validate_cnxml(cleaned)
    except ConversionError as err:
        return render_conversionerror(self.request, err.msg)
    except Exception:
        trace = traceback.format_exc()
        self.write_traceback_to_zipfile(trace, form)
        session = self.request.session
        if 'title' in session:
            del session['title']
        return render_to_response(
            'templates/error.pt', {'traceback': trace}, request=self.request)

    self.request.session.flash(self.message)
    next_url = self.request.route_url(self.nextStep())
    return HTTPFound(location=next_url)
def _process_gdocs_resource(klass, save_dir, html):
    """Convert Google Docs HTML to CNXML and store the result in ``save_dir``.

    Args:
        klass: first positional argument; not used by the body.
        save_dir: directory handed to ``save_cnxml``.
        html: the exported Google Docs HTML to convert.

    Returns:
        The fixed label ``"Google Document"``.
    """
    # Transform (downloading images along the way), then clean the output.
    converted, images = gdocs_to_cnxml(html, bDownloadImages=True)
    cleaned = clean_cnxml(converted)

    # Persist first, validate afterwards.
    save_cnxml(save_dir, cleaned, images.items())
    validate_cnxml(cleaned)

    return "Google Document"
def process(self, form):
    """Convert the uploaded LaTeX archive to CNXML and go to the next step.

    Reads ``self.original_filename``, transforms it with ``latex_to_cnxml``,
    then cleans, saves (into ``self.save_dir``) and validates the CNXML.

    Args:
        form: the submitted form; only forwarded to
            ``self.write_traceback_to_zipfile`` when an unexpected error
            is recorded.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        # Read through a context manager so the file handle is always
        # closed, even when reading fails.
        with open(self.original_filename) as f:
            latex_archive = f.read()

        # LaTeX 2 CNXML transformation
        cnxml, objects = latex_to_cnxml(latex_archive, self.original_filename)

        cnxml = clean_cnxml(cnxml)
        save_cnxml(self.save_dir, cnxml, objects.items())
        validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        # Top-level boundary: record the traceback and show the error page.
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        # Drop the stale title so it does not leak into the next upload.
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
def _process_gdocs_resource(klass, save_dir, html, kix=None):
    """Convert Google Docs HTML to CNXML and store the result in ``save_dir``.

    Args:
        klass: first positional argument; not used by the body.
        save_dir: directory handed to ``save_cnxml``.
        html: the exported Google Docs HTML to convert.
        kix: optional value forwarded to ``gdocs_to_cnxml`` as
            ``kixcontent``; defaults to ``None``.

    Returns:
        The fixed label ``"Google Document"``.
    """
    # Transform (downloading images along the way), then clean the output.
    converted, images = gdocs_to_cnxml(
        html, kixcontent=kix, bDownloadImages=True)
    cleaned = clean_cnxml(converted)

    # Persist first, validate afterwards.
    save_cnxml(save_dir, cleaned, images.items())
    validate_cnxml(cleaned)

    return "Google Document"
def process(self):
    """Import the URL from ``self.form`` as CNXML and go to the next step.

    Google Docs document URLs are delegated to
    ``self.process_gdocs_resource``. Any other URL is downloaded, converted
    with ``htmlsoup_to_cnxml``, cleaned, saved to ``self.save_dir`` and
    validated. The session's ``title`` and ``filename`` keys are set in
    both cases.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        source_url = self.form.data['url_text']

        # Pattern recognising Google Docs document URLs; the captured group
        # is the document's resource id.
        gdocs_pattern = re.compile(
            "^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
        match = gdocs_pattern.search(source_url)

        if not match:
            # Plain web page. A simple urlopen() will fail on mediawiki
            # websites (e.g. Wikipedia), so send a browser-like User-agent.
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            page = opener.open(source_url)
            html = page.read()

            # Transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=source_url)
            self.request.session['title'] = html_title

            cnxml = clean_cnxml(cnxml)
            save_cnxml(self.save_dir, cnxml, objects.items())

            # Keep the info we need for next uploads. Note that this might
            # kill the ability to do multiple tabs in parallel, unless it
            # gets offloaded onto the form again.
            self.request.session['filename'] = "HTML Document"

            validate_cnxml(cnxml)
        else:
            # Google Docs URLs take the dedicated import path.
            doc_id = "document:" + match.groups()[0]
            title, filename = self.process_gdocs_resource(
                self.save_dir, doc_id)
            self.request.session['title'] = title
            self.request.session['filename'] = filename

    except ConversionError as err:
        return render_conversionerror(self.request, err.msg)

    except Exception:
        trace = traceback.format_exc()
        self.write_traceback_to_zipfile(trace)
        session = self.request.session
        if 'title' in session:
            del session['title']
        return render_to_response(
            'templates/error.pt', {'traceback': trace}, request=self.request)

    self.request.session.flash(self.message)
    next_url = self.request.route_url(self.nextStep())
    return HTTPFound(location=next_url)
def gdoc2html(request):
    """Convert posted Google Docs HTML into structured or aloha-ready HTML.

    Reads these keys from ``request.POST``:
        html: the Google Docs HTML to convert (required).
        textbook_html: ``'1'`` selects the structured ("textbook") HTML,
            any other value selects the aloha-ready HTML; defaults to
            structured when the key is absent.
        copy_images: ``'1'`` asks the converter to download images;
            defaults to ``False`` when the key is absent.

    Returns:
        ``None`` when no ``html`` key was posted. Otherwise a dict with the
        keys ``html`` (the converted markup, or ``""`` when ``update_html``
        reports a conversion error), ``textbook_html`` and ``copy_images``
        (the effective boolean flags).
    """
    session = request.session

    # grab inputs
    if 'html' not in request.POST:
        return None
    html = request.POST['html']

    # Compare by value. ``is '1'`` tests object identity, which is not
    # guaranteed to hold for a string parsed out of a request, so the flags
    # could never reliably be switched on.
    if 'textbook_html' in request.POST:
        textbook_html = request.POST['textbook_html'] == '1'
    else:
        textbook_html = True
    if 'copy_images' in request.POST:
        copy_images = request.POST['copy_images'] == '1'
    else:
        copy_images = False

    # be anonymous
    session['login'] = AnonymousSession()

    # setup work directory: save_dir = transform_dir + user_subdir_name
    # NOTE(review): none of these three values is used below; the calls are
    # kept because create_save_dir presumably creates the directory as a
    # side effect -- confirm before removing.
    transform_dir = request.registry.settings['transform_dir']
    user_subdir_name, save_dir = create_save_dir(request)

    # allow cross domain access
    request.response.headers.add('Access-Control-Allow-Origin', '*')

    # convert gdoc html to cnxml to textbook (aka structured) html or
    # aloha-ready html
    cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=copy_images)
    cnxml = clean_cnxml(cnxml)
    title = None
    metadata = None
    alohareadyhtml, structuredhtml, conversion_error = update_html(
        cnxml, title, metadata)

    if conversion_error is None:
        html = structuredhtml if textbook_html else alohareadyhtml
    else:
        # Conversion failed: hand back empty markup instead of partial output.
        html = ""

    jsonresult = {
        "html": html,
        "textbook_html": textbook_html,
        "copy_images": copy_images,
    }
    return jsonresult
def process(self, zip_filename, form=None):
    """Unpack an uploaded CNXML ZIP, build its preview and go to the next step.

    The archive is extracted into ``self.save_dir`` and then moved there as
    ``upload.zip``. ``index.cnxml`` is read, an XHTML preview is written to
    ``index.xhtml``, and the cleaned CNXML is validated.

    Args:
        zip_filename: path of the uploaded ZIP archive.
        form: optional form, forwarded to ``self.write_traceback_to_zipfile``
            when an unexpected error is recorded. The error handler used to
            reference an undefined ``form`` name and raised ``NameError``
            while handling the original failure; the defaulted parameter
            keeps existing single-argument callers working.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        # The archive object is kept on the instance, so it is left open.
        self.zip_archive = zipfile.ZipFile(zip_filename, 'r')

        # Unzip into transform directory
        self.zip_archive.extractall(path=self.save_dir)

        # Rename ZIP file so that the user can download it again
        os.rename(zip_filename, os.path.join(self.save_dir, 'upload.zip'))

        # Read CNXML
        with open(os.path.join(self.save_dir, 'index.cnxml'), 'rt') as fp:
            cnxml = fp.read()

        # Convert the CNXML to XHTML for preview
        html = cnxml_to_htmlpreview(cnxml)
        with open(os.path.join(self.save_dir, 'index.xhtml'), 'w') as index:
            index.write(html)

        cnxml = clean_cnxml(cnxml)
        validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        # Top-level boundary: record the traceback and show the error page.
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        # Drop the stale title so it does not leak into the next upload.
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
def process(self, zip_filename):
    """Unpack an uploaded CNXML ZIP, build its preview and go to the next step.

    The archive is extracted into ``self.save_dir`` and then moved there as
    ``upload.zip``. ``index.cnxml`` is read, an XHTML preview is written to
    ``index.xhtml``, and the cleaned CNXML is validated.

    Args:
        zip_filename: path of the uploaded ZIP archive.

    Returns:
        The result of ``render_conversionerror`` when a ``ConversionError``
        is raised; the rendered ``templates/error.pt`` page (carrying the
        traceback) for any other exception; otherwise an ``HTTPFound``
        redirect to the route named by ``self.nextStep()``.
    """
    try:
        # Open the upload and unpack it into the transform directory.
        self.zip_archive = zipfile.ZipFile(zip_filename, 'r')
        self.zip_archive.extractall(path=self.save_dir)

        # Move the ZIP next to its contents so the user can download it again.
        kept_zip = os.path.join(self.save_dir, 'upload.zip')
        os.rename(zip_filename, kept_zip)

        # Load the CNXML shipped inside the archive.
        cnxml_path = os.path.join(self.save_dir, 'index.cnxml')
        with open(cnxml_path, 'rt') as source:
            cnxml = source.read()

        # Render the XHTML preview from the CNXML as read from disk.
        preview = cnxml_to_htmlpreview(cnxml)
        preview_path = os.path.join(self.save_dir, 'index.xhtml')
        with open(preview_path, 'w') as target:
            target.write(preview)

        # Validation runs on the cleaned CNXML.
        validate_cnxml(clean_cnxml(cnxml))

    except ConversionError as err:
        return render_conversionerror(self.request, err.msg)

    except Exception:
        trace = traceback.format_exc()
        self.write_traceback_to_zipfile(trace)
        session = self.request.session
        if 'title' in session:
            del session['title']
        return render_to_response(
            'templates/error.pt', {'traceback': trace}, request=self.request)

    self.request.session.flash(self.message)
    next_url = self.request.route_url(self.nextStep())
    return HTTPFound(location=next_url)
def process(self, form):
    """Import the URL given in ``form`` as CNXML and go to the next step.

    Google Docs document URLs are exported as HTML over HTTP and handed to
    ``GoogleDocProcessor.process_gdocs_resource``. Any other URL is
    downloaded, converted with ``htmlsoup_to_cnxml``, cleaned, saved to
    ``self.save_dir`` and validated.

    Args:
        form: the submitted form; ``form.data['url_text']`` holds the URL.
            The form is also forwarded to the Google Docs processor and to
            ``self.write_traceback_to_zipfile``.

    Returns:
        For Google Docs URLs: the result of ``process_gdocs_resource`` when
        the export succeeds, otherwise an ``HTTPFound`` redirect to the
        ``choose`` route after flashing a failure message. For other URLs:
        an ``HTTPFound`` redirect to the route named by ``self.nextStep()``.
        On errors: the result of ``render_conversionerror`` for a
        ``ConversionError``, or the rendered ``templates/error.pt`` page
        (carrying the traceback) for any other exception.
    """
    try:
        url = form.data['url_text']

        # Build a regex for Google Docs URLs. Raw string: the pattern text is
        # unchanged, but the backslashes are no longer treated as (invalid)
        # string escapes.
        regex = re.compile(
            r"^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
        r = regex.search(url)

        # Take special action for Google Docs URLs
        if r is not None:
            gdocs_resource_id = r.groups()[0]
            http = httplib2.Http()
            # A redirect must stay visible as a non-2xx status (see below).
            http.follow_redirects = False
            try:
                resp, html = http.request(
                    'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus' % gdocs_resource_id)
            except HttpError:
                # Deliberately falls through to the failure message below.
                pass
            else:
                # Check that status was OK, google docs sends a redirect to
                # a login page if not. An explicit range check does not
                # depend on how ``/`` divides integers.
                if 200 <= resp.status < 300:
                    # Get the title (does not work anymore)
                    title = 'Untitled Google Document'

                    # Process it
                    P = GoogleDocProcessor(self.request)
                    return P.process_gdocs_resource(html, title, form)
            self.request.session.flash('Failed to convert google document')
            return HTTPFound(location=self.request.route_url('choose'))
        else:
            # download html:
            # Simple urlopen() will fail on mediawiki websites eg. Wikipedia!
            import_opener = urllib2.build_opener()
            import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            self.request.session['title'] = html_title

            cnxml = clean_cnxml(cnxml)
            save_cnxml(self.save_dir, cnxml, objects.items())

            # Keep the info we need for next uploads. Note that
            # this might kill the ability to do multiple tabs in
            # parallel, unless it gets offloaded onto the form
            # again.
            self.request.session['filename'] = "HTML Document"

            validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        # Top-level boundary: record the traceback and show the error page.
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        # Drop the stale title so it does not leak into the next upload.
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
def process(self, form):
    """Import the URL given in ``form`` as CNXML and go to the next step.

    Google Docs document URLs are exported over HTTP, both as HTML and in
    the ``kix`` export format, and handed to
    ``GoogleDocProcessor.process_gdocs_resource``. Any other URL is
    downloaded, converted with ``htmlsoup_to_cnxml``, cleaned, saved to
    ``self.save_dir`` and validated.

    Args:
        form: the submitted form; ``form.data['url_text']`` holds the URL.
            The form is also forwarded to the Google Docs processor and to
            ``self.write_traceback_to_zipfile``.

    Returns:
        For Google Docs URLs: the result of ``process_gdocs_resource`` when
        the HTML export succeeds, otherwise an ``HTTPFound`` redirect to the
        ``choose`` route after flashing a failure message. For other URLs:
        an ``HTTPFound`` redirect to the route named by ``self.nextStep()``.
        On errors: the result of ``render_conversionerror`` for a
        ``ConversionError``, or the rendered ``templates/error.pt`` page
        (carrying the traceback) for any other exception.
    """
    try:
        url = form.data['url_text']

        # Build a regex for Google Docs URLs. Raw string: the pattern text is
        # unchanged, but the backslashes are no longer treated as (invalid)
        # string escapes.
        regex = re.compile(
            r"^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
        r = regex.search(url)

        # Take special action for Google Docs URLs
        if r is not None:
            gdocs_resource_id = r.groups()[0]
            http = httplib2.Http()
            # A redirect must stay visible as a non-2xx status (see below).
            http.follow_redirects = False
            try:
                resp, html = http.request(
                    'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus' % gdocs_resource_id)
                # NOTE(review): only the HTML response status is checked
                # below; the kix response (resp2) is never inspected, so kix
                # may hold a redirect/login body -- confirm this is intended.
                resp2, kix = http.request(
                    'https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=kix' % gdocs_resource_id)
            except HttpError:
                # Deliberately falls through to the failure message below.
                pass
            else:
                # Check that status was OK, google docs sends a redirect to
                # a login page if not. An explicit range check does not
                # depend on how ``/`` divides integers.
                if 200 <= resp.status < 300:
                    # Get the title
                    title = 'Untitled Google Document'

                    # Process it
                    P = GoogleDocProcessor(self.request)
                    return P.process_gdocs_resource(html, title, form, kix)
            self.request.session.flash('Failed to convert google document')
            return HTTPFound(location=self.request.route_url('choose'))
        else:
            # download html:
            # Simple urlopen() will fail on mediawiki websites eg. Wikipedia!
            import_opener = urllib2.build_opener()
            import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            self.request.session['title'] = html_title

            cnxml = clean_cnxml(cnxml)
            save_cnxml(self.save_dir, cnxml, objects.items())

            # Keep the info we need for next uploads. Note that
            # this might kill the ability to do multiple tabs in
            # parallel, unless it gets offloaded onto the form
            # again.
            self.request.session['filename'] = "HTML Document"

            validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        # Top-level boundary: record the traceback and show the error page.
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        # Drop the stale title so it does not leak into the next upload.
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))