def test_url(self):
    url_files = os.listdir(test_folder_name + 'url/')
    i = 0
    while i < len(url_files):
        f = url_files[i]
        filename, extension = os.path.splitext(f)
        if extension != '':
            url_files.remove(f)
        else:
            i = i + 1
    for f in url_files:
        input_file = open(test_folder_name + 'url/' + f, 'r')
        url = input_file.readline()
        input_file.close()
        output_filename = test_folder_name + 'url/' + f + '.cnxml'
        valid_filename = test_folder_name + 'url/' + f + '.cnxml'
        output_filename = test_folder_name + 'url/' + f + '.tmp'
        diff_filename = test_folder_name + 'url/' + f + '.diff'
        err_filename = test_folder_name + 'url/' + f + '.err'
        import_opener = urllib2.build_opener()
        import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            cnxml = clean_cnxml(cnxml)
            validate_cnxml(cnxml)

            output = open(output_filename, 'w')
            output.write(cnxml)
            output.close()
            remove_ids(output_filename)

            process = subprocess.Popen(['diff', valid_filename, output_filename],
                                       shell=False, stdout=subprocess.PIPE)
            std_output = process.communicate()
            if std_output[0] is not None and len(std_output[0]) != 0:
                diff_output = open(diff_filename, 'w')
                diff_output.write(std_output[0])
                diff_output.close()
                print('Differences in the testing of ' + f +
                      ', information on those differences has been placed in ' +
                      diff_filename)
            elif std_output[1] is not None and len(std_output[1]) != 0:
                err_output = open(err_filename, 'w')
                err_output.write(std_output[1])
                err_output.close()
                print('Error(s) occurred while attempting to test for differences '
                      'in CNXML output of ' + f +
                      ', information on these errors are in ' + err_filename)
        except urllib2.URLError, e:
            print('URL ' + url + ' could not be opened')
            quit()
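# A minimal, self-contained sketch of the diff-based check that test_url performs:
# it shells out to `diff` exactly as above and treats non-empty stdout as a
# mismatch between the freshly generated CNXML and the stored valid copy. The
# file names in the usage comment are illustrative, not real test fixtures.
import subprocess

def cnxml_files_match(valid_filename, output_filename):
    process = subprocess.Popen(['diff', valid_filename, output_filename],
                               shell=False, stdout=subprocess.PIPE)
    stdout, stderr = process.communicate()
    # diff prints nothing on stdout when the two files are identical
    return stdout is None or len(stdout) == 0

# Example: cnxml_files_match('fixtures/page.cnxml', 'fixtures/page.tmp')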
def process(self):
    try:
        url = self.form.data['url_text']

        # Build a regex for Google Docs URLs
        regex = re.compile(
            "^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
        r = regex.search(url)

        # Take special action for Google Docs URLs
        if r:
            gdocs_resource_id = r.groups()[0]
            doc_id = "document:" + gdocs_resource_id
            title, filename = self.process_gdocs_resource(self.save_dir, doc_id)
            self.request.session['title'] = title
            self.request.session['filename'] = filename
        else:
            # download html:
            # Simple urlopen() will fail on mediawiki websites eg. Wikipedia!
            import_opener = urllib2.build_opener()
            import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            self.request.session['title'] = html_title
            cnxml = clean_cnxml(cnxml)
            save_cnxml(self.save_dir, cnxml, objects.items())

            # Keep the info we need for next uploads. Note that this might
            # kill the ability to do multiple tabs in parallel, unless it
            # gets offloaded onto the form again.
            self.request.session['filename'] = "HTML Document"

            validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
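# A stand-alone illustration of the opener trick used above. MediaWiki sites
# (e.g. Wikipedia) reject the default "Python-urllib" user agent, which is why
# the view sends a browser-like User-agent header instead of calling
# urllib2.urlopen() directly. The URL in the usage comment is just an example.
import urllib2

def fetch_html(url):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    return opener.open(url).read()

# Example: fetch_html('https://en.wikipedia.org/wiki/CNXML')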
def process(self, form):
    try:
        url = form.data['url_text']

        # Build a regex for Google Docs URLs
        regex = re.compile(
            "^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
        r = regex.search(url)

        # Take special action for Google Docs URLs
        if r is not None:
            gdocs_resource_id = r.groups()[0]
            http = httplib2.Http()
            http.follow_redirects = False
            try:
                resp, html = http.request(
                    'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus'
                    % gdocs_resource_id)
            except HttpError:
                pass
            else:
                # Check that status was OK, google docs sends a redirect to a
                # login page if not.
                if resp.status / 100 == 2:
                    # Get the title (does not work anymore)
                    title = 'Untitled Google Document'
                    # Process it
                    P = GoogleDocProcessor(self.request)
                    return P.process_gdocs_resource(html, title, form)
            self.request.session.flash('Failed to convert google document')
            return HTTPFound(location=self.request.route_url('choose'))
        else:
            # download html:
            # Simple urlopen() will fail on mediawiki websites eg. Wikipedia!
            import_opener = urllib2.build_opener()
            import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            self.request.session['title'] = html_title
            cnxml = clean_cnxml(cnxml)
            save_cnxml(self.save_dir, cnxml, objects.items())

            # Keep the info we need for next uploads. Note that this might
            # kill the ability to do multiple tabs in parallel, unless it
            # gets offloaded onto the form again.
            self.request.session['filename'] = "HTML Document"

            validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
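# Quick illustration of what the Google Docs regex above captures: the single
# capture group is the document's resource id, which the code then turns into
# "document:<id>" or an export URL. The sample URL below is made up.
import re

_GDOCS_RE = re.compile(
    "^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")

match = _GDOCS_RE.search(
    'https://docs.google.com/document/d/abc123XYZ/edit')
if match is not None:
    print(match.groups()[0])  # -> abc123XYZ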
        remove_ids(valid_filename)
    elif extension == '.html':
        print('Assuming this is a file containing a URL')
        f = filename
        input_file = open(f, 'r')
        name, extension = os.path.splitext(filename)
        valid_filename = name + '.cnxml'
        try:
            html = input_file.read()
            input_file.close()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=filename)
            cnxml = clean_cnxml(cnxml)

            output = open(valid_filename, 'w')
            output.write(cnxml)
            output.close()
            remove_ids(valid_filename)
        except urllib2.URLError, e:
            print('URL ' + url + ' could not be opened')
            quit()
    print('Done. Valid output has been placed in ' + valid_filename)
def test_url(self):
    url_files = os.listdir(test_folder_name + 'url/')
    i = 0
    while i < len(url_files):
        f = url_files[i]
        filename, extension = os.path.splitext(f)
        if extension != '':
            url_files.remove(f)
        else:
            i = i + 1
    for f in url_files:
        input_file = open(test_folder_name + 'url/' + f, 'r')
        url = input_file.readline()
        input_file.close()
        output_filename = test_folder_name + 'url/' + f + '.cnxml'
        valid_filename = test_folder_name + 'url/' + f + '.cnxml'
        output_filename = test_folder_name + 'url/' + f + '.tmp'
        diff_filename = test_folder_name + 'url/' + f + '.diff'
        err_filename = test_folder_name + 'url/' + f + '.err'
        import_opener = urllib2.build_opener()
        import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            cnxml = clean_cnxml(cnxml)
            validate_cnxml(cnxml)

            output = open(output_filename, 'w')
            output.write(cnxml)
            output.close()
            remove_ids(output_filename)

            process = subprocess.Popen(
                ['diff', valid_filename, output_filename],
                shell=False, stdout=subprocess.PIPE)
            std_output = process.communicate()
            if std_output[0] is not None and len(std_output[0]) != 0:
                diff_output = open(diff_filename, 'w')
                diff_output.write(std_output[0])
                diff_output.close()
                print('Differences in the testing of ' + f +
                      ', information on those differences has been placed in ' +
                      diff_filename)
            elif std_output[1] is not None and len(std_output[1]) != 0:
                err_output = open(err_filename, 'w')
                err_output.write(std_output[1])
                err_output.close()
                print('Error(s) occurred while attempting to test for differences '
                      'in CNXML output of ' + f +
                      ', information on these errors are in ' + err_filename)
        except urllib2.URLError, e:
            print('URL ' + url + ' could not be opened')
            quit()
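# remove_ids() is called on both the stored valid file and the freshly generated
# one so that auto-generated id attributes do not show up as spurious diffs. Its
# real implementation is not shown in this section; the following is only a
# plausible sketch of that idea, rewriting the file in place with a regex.
import re

def remove_ids_sketch(filename):
    with open(filename, 'r') as fp:
        content = fp.read()
    # Drop id="..." attributes; assumes ids never contain embedded quotes.
    content = re.sub(r'\s+id="[^"]*"', '', content)
    with open(filename, 'w') as fp:
        fp.write(content)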
def choose_view(request):
    check_login(request)

    templatePath = 'templates/choose.pt'
    form = Form(request, schema=UploadSchema)
    field_list = [('upload', 'File')]

    # clear the session
    if 'transformerror' in request.session:
        del request.session['transformerror']
    if 'title' in request.session:
        del request.session['title']

    # Check for successful form completion
    if form.validate():
        try:  # Catch-all exception block
            # Create a directory to do the conversions
            now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
            # TODO: This has a good chance of being unique, but even so...
            temp_dir_name = '%s-%s' % (request.session['username'], now_string)
            save_dir = os.path.join(
                request.registry.settings['transform_dir'],
                temp_dir_name
            )
            os.mkdir(save_dir)

            # Keep the info we need for next uploads. Note that this might
            # kill the ability to do multiple tabs in parallel, unless it
            # gets offloaded onto the form again.
            request.session['upload_dir'] = temp_dir_name
            if form.data['upload'] is not None:
                request.session['filename'] = form.data['upload'].filename

            # Google Docs Conversion
            # if we have a Google Docs ID and Access token.
            if form.data['gdocs_resource_id']:
                gdocs_resource_id = form.data['gdocs_resource_id']
                gdocs_access_token = form.data['gdocs_access_token']
                form.data['gdocs_resource_id'] = None
                form.data['gdocs_access_token'] = None

                (request.session['title'], request.session['filename']) = \
                    process_gdocs_resource(save_dir,
                                           gdocs_resource_id,
                                           gdocs_access_token)

            # HTML URL Import:
            elif form.data.get('url_text'):
                url = form.data['url_text']
                form.data['url_text'] = None

                # Build a regex for Google Docs URLs
                regex = re.compile(
                    "^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
                r = regex.search(url)

                # Take special action for Google Docs URLs
                if r:
                    gdocs_resource_id = r.groups()[0]
                    (request.session['title'], request.session['filename']) = \
                        process_gdocs_resource(save_dir,
                                               "document:" + gdocs_resource_id)
                else:
                    # download html:
                    # html = urllib2.urlopen(url).read()
                    # Simple urlopen() will fail on mediawiki websites like e.g. Wikipedia!
                    import_opener = urllib2.build_opener()
                    import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
                    try:
                        import_request = import_opener.open(url)
                        html = import_request.read()

                        # transformation
                        cnxml, objects, html_title = htmlsoup_to_cnxml(
                            html, bDownloadImages=True, base_or_source_url=url)
                        request.session['title'] = html_title
                        cnxml = clean_cnxml(cnxml)
                        save_cnxml(save_dir, cnxml, objects.items())

                        # Keep the info we need for next uploads. Note that
                        # this might kill the ability to do multiple tabs in
                        # parallel, unless it gets offloaded onto the form
                        # again.
                        request.session['filename'] = "HTML Document"

                        validate_cnxml(cnxml)
                    except urllib2.URLError, e:
                        request['errors'] = ['The URL %s could not be opened' % url, ]
                        response = {
                            'form': FormRenderer(form),
                        }
                        return render_to_response(templatePath, response,
                                                  request=request)

            # Office, CNXML-ZIP or LaTeX-ZIP file
            else:
                # Save the original file so that we can convert, plus keep it.
                original_filename = os.path.join(
                    save_dir,
                    form.data['upload'].filename.replace(os.sep, '_'))
                saved_file = open(original_filename, 'wb')
                input_file = form.data['upload'].file
                shutil.copyfileobj(input_file, saved_file)
                saved_file.close()
                input_file.close()
                form.data['upload'] = None

                # Check if it is a ZIP file with at least index.cnxml or a
                # LaTeX file in it
                try:
                    zip_archive = zipfile.ZipFile(original_filename, 'r')
                    is_zip_archive = ('index.cnxml' in zip_archive.namelist())

                    # Do we have a latex file?
                    if not is_zip_archive:
                        # incoming latex.zip must contain a latex.tex file,
                        # where "latex" is the base name.
                        (latex_head, latex_tail) = os.path.split(original_filename)
                        (latex_root, latex_ext) = os.path.splitext(latex_tail)
                        latex_basename = latex_root
                        latex_filename = latex_basename + '.tex'
                        is_latex_archive = (latex_filename in zip_archive.namelist())
                except zipfile.BadZipfile:
                    is_zip_archive = False
                    is_latex_archive = False

                # ZIP package from previous conversion
                if is_zip_archive:
                    # Unzip into transform directory
                    zip_archive.extractall(path=save_dir)

                    # Rename ZIP file so that the user can download it again
                    os.rename(original_filename,
                              os.path.join(save_dir, 'upload.zip'))

                    # Read CNXML
                    with open(os.path.join(save_dir, 'index.cnxml'), 'rt') as fp:
                        cnxml = fp.read()

                    # Convert the CNXML to XHTML for preview
                    html = cnxml_to_htmlpreview(cnxml)
                    with open(os.path.join(save_dir, 'index.xhtml'), 'w') as index:
                        index.write(html)

                    cnxml = clean_cnxml(cnxml)
                    validate_cnxml(cnxml)

                # LaTeX
                elif is_latex_archive:
                    f = open(original_filename)
                    latex_archive = f.read()

                    # LaTeX 2 CNXML transformation
                    cnxml, objects = latex_to_cnxml(latex_archive, original_filename)

                    cnxml = clean_cnxml(cnxml)
                    save_cnxml(save_dir, cnxml, objects.items())
                    validate_cnxml(cnxml)

                # OOo / MS Word Conversion
                else:
                    # Convert from other office format to odt if needed
                    odt_filename = original_filename
                    filename, extension = os.path.splitext(original_filename)
                    if extension != '.odt':
                        odt_filename = '%s.odt' % filename
                        command = '/usr/bin/soffice -headless -nologo -nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + escape_system(original_filename)[1:-1] + ',' + odt_filename + ')"'
                        os.system(command)
                        try:
                            fp = open(odt_filename, 'r')
                            fp.close()
                        except IOError as io:
                            raise ConversionError("%s not found" % original_filename)

                    # Convert and save all the resulting files.
                    tree, files, errors = transform(odt_filename)
                    cnxml = clean_cnxml(etree.tostring(tree))
                    save_cnxml(save_dir, cnxml, files.items())

                    # now validate with jing
                    validate_cnxml(cnxml)
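# Stand-alone sketch of the archive sniffing performed above: a ZIP containing
# index.cnxml is treated as a previous conversion, while a ZIP named foo.zip
# that contains foo.tex is treated as a LaTeX upload; anything else falls
# through to the office-document path. The file name in the usage comment is
# illustrative only.
import os
import zipfile

def classify_upload(original_filename):
    try:
        zip_archive = zipfile.ZipFile(original_filename, 'r')
    except zipfile.BadZipfile:
        return 'office'  # not a ZIP at all
    names = zip_archive.namelist()
    if 'index.cnxml' in names:
        return 'cnxml-zip'
    basename = os.path.splitext(os.path.basename(original_filename))[0]
    if basename + '.tex' in names:
        return 'latex-zip'
    return 'office'

# Example: classify_upload('/tmp/upload/latex.zip')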
        output.close()
        remove_ids(valid_filename)
    elif extension == '.html':
        print('Assuming this is a file containing a URL')
        f = filename
        input_file = open(f, 'r')
        name, extension = os.path.splitext(filename)
        valid_filename = name + '.cnxml'
        try:
            html = input_file.read()
            input_file.close()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=filename)
            cnxml = clean_cnxml(cnxml)

            output = open(valid_filename, 'w')
            output.write(cnxml)
            output.close()
            remove_ids(valid_filename)
        except urllib2.URLError, e:
            print('URL ' + url + ' could not be opened')
            quit()
    print('Done. Valid output has been placed in ' + valid_filename)
def process(self, form):
    try:
        url = form.data['url_text']

        # Build a regex for Google Docs URLs
        regex = re.compile(
            "^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/")
        r = regex.search(url)

        # Take special action for Google Docs URLs
        if r is not None:
            gdocs_resource_id = r.groups()[0]
            http = httplib2.Http()
            http.follow_redirects = False
            try:
                resp, html = http.request(
                    'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus'
                    % gdocs_resource_id)
                resp2, kix = http.request(
                    'https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=kix'
                    % gdocs_resource_id)
            except HttpError:
                pass
            else:
                # Check that status was OK, google docs sends a redirect to a
                # login page if not.
                if resp.status / 100 == 2:
                    # Get the title
                    title = 'Untitled Google Document'
                    # Process it
                    P = GoogleDocProcessor(self.request)
                    return P.process_gdocs_resource(html, title, form, kix)
            self.request.session.flash('Failed to convert google document')
            return HTTPFound(location=self.request.route_url('choose'))
        else:
            # download html:
            # Simple urlopen() will fail on mediawiki websites eg. Wikipedia!
            import_opener = urllib2.build_opener()
            import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            import_request = import_opener.open(url)
            html = import_request.read()

            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            self.request.session['title'] = html_title
            cnxml = clean_cnxml(cnxml)
            save_cnxml(self.save_dir, cnxml, objects.items())

            # Keep the info we need for next uploads. Note that this might
            # kill the ability to do multiple tabs in parallel, unless it
            # gets offloaded onto the form again.
            self.request.session['filename'] = "HTML Document"

            validate_cnxml(cnxml)

    except ConversionError as e:
        return render_conversionerror(self.request, e.msg)

    except Exception:
        tb = traceback.format_exc()
        self.write_traceback_to_zipfile(tb, form)
        templatePath = 'templates/error.pt'
        response = {'traceback': tb}
        if 'title' in self.request.session:
            del self.request.session['title']
        return render_to_response(templatePath, response, request=self.request)

    self.request.session.flash(self.message)
    return HTTPFound(location=self.request.route_url(self.nextStep()))
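# Minimal illustration of the httplib2 pattern above: redirects are disabled so
# that a Google Docs "please log in" redirect surfaces as a 3xx status instead
# of silently returning the login page, and only a 2xx response is accepted.
# The document id in the usage comment is a placeholder.
import httplib2

def export_gdoc_html(gdocs_resource_id):
    http = httplib2.Http()
    http.follow_redirects = False
    resp, content = http.request(
        'https://docs.google.com/document/d/%s/export?format=html'
        % gdocs_resource_id)
    # Same check as above (Python 2 integer division: any 2xx status passes).
    if resp.status / 100 == 2:
        return content
    return None

# Example: export_gdoc_html('abc123XYZ')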