def main():
    """Regenerate the Google Docs testbed output via the GDocs client.

    Steps, in order:
      1. Abort with exit status 1 if java_installed() reports no Java
         (Java is needed for the Jing validation step).
      2. Empty TESTBED_OUTPUT_DIR.
      3. For every non-comment line of the testbed URL file that contains a
         'docs.google.com/document/d/<id>' URL: fetch the document through
         the authorized client, write '<id>.htm', convert it with
         gdocs_to_cnxml(..., bDownloadImages=True), write the returned
         objects and '<id>.xml' into a per-document sub directory, and
         validate the CNXML with Jing into '<id>.log'.

    Validation is skipped when the first command-line argument is '-noval'.
    """
    # make sure Java is installed (needed for Jing)
    if not java_installed():
        # Single-argument print(...) prints the same text whether it is
        # parsed as a Python 2 print statement or as a function call.
        print("ERROR: Could not find Java. Please keep sure that Java is installed and available.")
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive interpreter and is not guaranteed to be defined.
        sys.exit(1)

    # delete the contents of the testbed folder
    delete_all_contents_of_folder(TESTBED_OUTPUT_DIR)

    # login to gdocs and get a client object
    gd_client = getAuthorizedGoogleDocsClient()

    # the command line does not change while we run, so decide this once
    skip_validation = len(sys.argv) > 1 and sys.argv[1] == '-noval'

    # open file with GDocs public documents URLs (<- the testbed for GDocs);
    # the with-block guarantees the file is closed again
    urls_path = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(urls_path) as url_file:
        for url in url_file:
            # Lines read from a file keep their trailing newline, and the
            # ID pattern '[^/]+' would happily swallow it when the URL ends
            # right after the ID. Strip before matching.
            url = url.strip()
            if url.startswith('#'):
                continue  # ignore comments

            # check if we really have a gdocs document with an ID
            # Get the ID out of the URL with regular expression
            match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
            if not match_doc_id:
                continue
            doc_id = match_doc_id.group(1)

            # create a sub directory named like the ID
            doc_output_dir = os.path.join(TESTBED_OUTPUT_DIR, doc_id)
            try:
                os.mkdir(doc_output_dir)
            except OSError:
                pass  # If subdirectory already exists do nothing

            doc_key = 'document:' + doc_id
            print_status('Getting ' + doc_key)

            # get the Google Docs Entry
            gd_entry = gd_client.GetDoc(doc_key)
            # Get the contents of the document
            # should be the same as url, but better ask API for url
            gd_entry_url = gd_entry.content.src
            html = gd_client.get_file_content(gd_entry_url)  # requires a URL

            # write testbed source html output
            html_filename = os.path.join(doc_output_dir, doc_id + '.htm')
            with open(html_filename, 'w') as html_file:
                html_file.write(html)

            print_status('Transforming and get images from %s' % doc_key)

            # transformation and get images
            cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)

            # write testbed images
            for image_filename, image in objects.iteritems():
                image_path = os.path.join(doc_output_dir, image_filename)
                # write binary, important!
                with open(image_path, 'wb') as image_file:
                    image_file.write(image)

            # write testbed CNXML output
            cnxml_filename = os.path.join(doc_output_dir, doc_id + '.xml')
            with open(cnxml_filename, 'w') as cnxml_file:
                cnxml_file.write(cnxml)

            # validate CNXML output with Jing Relax NG
            if skip_validation:
                print_status('Validation skipped')
            else:
                print_status('Validating %s' % doc_key)
                jing_log_filename = os.path.join(doc_output_dir, doc_id + '.log')
                jing_validate_file(cnxml_filename, jing_log_filename)

    print_status('Finished!')
def main():
    """Regenerate the Google Docs testbed output via direct HTTP export.

    Steps, in order:
      1. Abort with exit status 1 if java_installed() reports no Java
         (Java is needed for the Jing validation step).
      2. Empty TESTBED_OUTPUT_DIR.
      3. For every non-comment line of the testbed URL file that contains a
         'docs.google.com/document/d/<id>' URL: download the HTML export and
         the Kix export of the document with httplib2, write '<id>.htm',
         convert with gdocs_to_cnxml(html, kixcontent=kix,
         bDownloadImages=True), write the returned objects and '<id>.xml'
         into a per-document sub directory, and validate the CNXML with
         Jing into '<id>.log'.

    A document whose HTML or Kix download raises is reported and skipped.
    Validation is skipped when the first command-line argument is '-noval'.
    """
    # make sure Java is installed (needed for Jing)
    if not java_installed():
        # Single-argument print(...) prints the same text whether it is
        # parsed as a Python 2 print statement or as a function call.
        print("ERROR: Could not find Java. Please keep sure that Java is installed and available.")
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive interpreter and is not guaranteed to be defined.
        sys.exit(1)

    # delete the contents of the testbed folder
    delete_all_contents_of_folder(TESTBED_OUTPUT_DIR)

    # the command line does not change while we run, so decide this once
    skip_validation = len(sys.argv) > 1 and sys.argv[1] == '-noval'

    # open file with GDocs public documents URLs (<- the testbed for GDocs);
    # the with-block guarantees the file is closed again
    urls_path = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(urls_path) as url_file:
        for url in url_file:
            # Lines read from a file keep their trailing newline, and the
            # ID pattern '[^/]+' would happily swallow it when the URL ends
            # right after the ID. Strip before matching.
            url = url.strip()
            if url.startswith('#'):
                continue  # ignore comments

            # check if we really have a gdocs document with an ID
            # Get the ID out of the URL with regular expression
            match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
            if not match_doc_id:
                continue
            doc_id = match_doc_id.group(1)

            # create a sub directory named like the ID
            doc_output_dir = os.path.join(TESTBED_OUTPUT_DIR, doc_id)
            try:
                os.mkdir(doc_output_dir)
            except OSError:
                pass  # If subdirectory already exists do nothing

            doc_key = 'document:' + doc_id
            print_status('Getting ' + doc_key)

            # get the Google Docs by fetching the HTML directly
            http = httplib2.Http()
            http.follow_redirects = False
            # NOTE(review): with redirects disabled, a non-200 answer is
            # still returned as a normal response and is not checked here -
            # confirm whether resp.status should be validated.

            plain_html_url = 'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus' % doc_id
            print_status('URL: ' + plain_html_url)
            try:
                resp, html = http.request(plain_html_url)
            # HttpError is kept from the original handler; HttpLib2Error is
            # the base class of the errors httplib2 itself raises.
            except (HttpError, httplib2.HttpLib2Error):
                print("Error: Failed to download Google Docs HTML")
                # Without this, 'html' would be unbound (first document) or
                # still hold the previous document's content.
                continue

            kix_url = 'https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=kix' % doc_id
            print_status('URL: ' + kix_url)
            try:
                resp, kix = http.request(kix_url)
            except (HttpError, httplib2.HttpLib2Error):
                print("Error: Failed to download Google Docs Kix")
                # Same reasoning as above: never convert with a missing or
                # stale 'kix' value.
                continue

            # write testbed source html output
            html_filename = os.path.join(doc_output_dir, doc_id + '.htm')
            with open(html_filename, 'w') as html_file:
                html_file.write(html)

            print_status('Transforming and get images from %s' % doc_key)

            # transformation and get images
            cnxml, objects = gdocs_to_cnxml(html, kixcontent=kix, bDownloadImages=True)

            # write testbed images
            for image_filename, image in objects.iteritems():
                image_path = os.path.join(doc_output_dir, image_filename)
                # write binary, important!
                with open(image_path, 'wb') as image_file:
                    image_file.write(image)

            # write testbed CNXML output
            cnxml_filename = os.path.join(doc_output_dir, doc_id + '.xml')
            with open(cnxml_filename, 'w') as cnxml_file:
                cnxml_file.write(cnxml)

            # validate CNXML output with Jing Relax NG
            if skip_validation:
                print_status('Validation skipped')
            else:
                print_status('Validating %s' % doc_key)
                jing_log_filename = os.path.join(doc_output_dir, doc_id + '.log')
                jing_validate_file(cnxml_filename, jing_log_filename)

    print_status('Finished!!!')