def process_gdocs_resource(save_dir, gdocs_resource_id, gdocs_access_token=None):
    """Fetch a Google Docs document, convert it to CNXML and save it.

    Downloads the document identified by gdocs_resource_id (optionally
    authenticating with gdocs_access_token), transforms its HTML export to
    CNXML, cleans it, saves it (plus any downloaded images) into save_dir,
    and validates the result.

    Returns a (title, filename) tuple; the filename is always the literal
    "Google Document".  Old comment states that returning this filename
    might kill the ability to do multiple tabs in parallel, unless it gets
    offloaded onto the form again.
    """
    # Authorized client for talking to the Google Docs API.
    client = getAuthorizedGoogleDocsClient()

    # Wrap the raw access-token string in an AuthSub token when present;
    # anonymous access otherwise.
    if gdocs_access_token:
        token = gdata.gauth.AuthSubToken(gdocs_access_token)
    else:
        token = None

    # Look up the document entry, then pull its HTML content.
    entry = client.GetDoc(gdocs_resource_id, None, token)
    html = client.get_file_content(entry.content.src, token)

    # HTML -> CNXML (downloading referenced images), then clean, save
    # and validate.
    cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
    cnxml = clean_cnxml(cnxml)
    save_cnxml(save_dir, cnxml, objects.items())
    validate_cnxml(cnxml)

    return (entry.title.text, "Google Document")
def test_latex(self): latex_files = os.listdir(test_folder_name + 'latex/') i = 0 while (i < len(latex_files)): f = latex_files[i] filename, extension = os.path.splitext(f) if (extension != ''): latex_files.remove(f) else: i = i + 1 for f in latex_files: original_filename = test_folder_name + 'latex/' + f filename, extension = os.path.splitext(original_filename) valid_filename = filename + '.cnxml' output_filename = filename + '.tmp' diff_filename = filename + '.diff' err_filename = filename + '.err' fp = open(original_filename, 'r') latex_archive = fp.read() # LaTeX 2 CNXML transformation cnxml, objects = latex_to_cnxml(latex_archive, original_filename) cnxml = clean_cnxml(cnxml) save_cnxml(save_dir, cnxml, objects.items()) validate_cnxml(cnxml) fp.close() output = open(output_filename, 'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen( ['diff', valid_filename, output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if (std_output[0] != None and len(std_output[0]) != 0): diff_output = open(diff_filename, 'w') diff_output.write(std_output[0]) diff_output.close() print( 'Differences in the testing of ' + f + ', information on those differences has been placed in ' + diff_filename) elif (std_output[1] != None and len(std_output[1]) != 0): err_output = open(err_filename, 'w') err_output.write(std_output[1]) err_output.close() print( 'Error(s) occurred while attempting to test for differences in CNXML output of ' + f + ', information on these errors are in ' + err_filename)
def test_url(self): url_files=os.listdir(test_folder_name+'url/') i=0 while(i < len(url_files)): f=url_files[i] filename, extension = os.path.splitext(f) if(extension != ''): url_files.remove(f) else: i=i+1 for f in url_files: input_file=open(test_folder_name+'url/'+f,'r') url=input_file.readline() input_file.close() output_filename=test_folder_name+'url/'+f+'.cnxml' valid_filename=test_folder_name+'url/'+f+'.cnxml' output_filename=test_folder_name+'url/'+f+'.tmp' diff_filename = test_folder_name+'url/'+f+'.diff' err_filename = test_folder_name+'url/'+f+'.err' import_opener = urllib2.build_opener() import_opener.addheaders = [('User-agent', 'Mozilla/5.0')] try: import_request = import_opener.open(url) html = import_request.read() # transformation cnxml, objects, html_title = htmlsoup_to_cnxml( html, bDownloadImages=True, base_or_source_url=url) cnxml = clean_cnxml(cnxml) validate_cnxml(cnxml) output=open(output_filename,'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen(['diff',valid_filename,output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if(std_output[0] != None and len(std_output[0]) != 0): diff_output=open(diff_filename,'w') diff_output.write(std_output[0]) diff_output.close() print('Differences in the testing of '+f+', information on those differences has been placed in '+diff_filename) elif(std_output[1] != None and len(std_output[1]) != 0): err_output=open(err_filename,'w') err_output.write(std_output[1]) err_output.close() print('Error(s) occurred while attempting to test for differences in CNXML output of '+f+', information on these errors are in '+err_filename) except urllib2.URLError, e: print('URL '+url+' could not be opened') quit()
def test_odt(self): odt_files = os.listdir(test_folder_name + 'odt/') i = 0 # Find only .odt files in the testing folder for odt while (i < len(odt_files)): f = odt_files[i] filename, extension = os.path.splitext(f) if (extension != '.odt'): odt_files.remove(f) else: i = i + 1 for f in odt_files: original_filename = test_folder_name + 'odt/' + f filename, extension = os.path.splitext(original_filename) valid_filename = filename + '.cnxml' output_filename = filename + '.tmp' odt_filename = original_filename diff_filename = filename + '.diff' err_filename = filename + '.err' try: open(valid_filename, 'r') except IOError as e: print('Missing valid file (' + valid_filename + ') for testing ' + original_filename) return tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) validate_cnxml(cnxml) output = open(output_filename, 'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen( ['diff', valid_filename, output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if (std_output[0] != None and len(std_output[0]) != 0): diff_output = open(diff_filename, 'w') diff_output.write(std_output[0]) diff_output.close() print( 'Differences in the testing of ' + original_filename + ', information on those differences has been placed in ' + diff_filename) elif (std_output[1] != None and len(std_output[1]) != 0): err_output = open(err_filename, 'w') err_output.write(std_output[1]) err_output.close() print( 'Error(s) occurred while attempting to test for differences in CNXML output of ' + original_filename + ', information on these errors are in ' + err_filename)
def cnxml_view(request):
    """Pyramid view: show and save the editable CNXML of the current upload.

    GET (or invalid form): read index.cnxml from the session's transform
    directory, clean it and render it in the CodeMirror editor template.
    POST with a valid 'cnxml' field: save the edited CNXML together with
    the auxiliary files from upload.zip, validate, then redirect to the
    preview page.  Raises HTTPNotFound when no index.cnxml exists yet.
    """
    check_login(request)
    form = Form(request, schema=CnxmlSchema)
    # Per-session working directory created during the original upload.
    save_dir = os.path.join(request.registry.settings['transform_dir'], request.session['upload_dir'])
    cnxml_filename = os.path.join(save_dir, 'index.cnxml')
    transformerror = request.session.get('transformerror')

    # Check for successful form completion
    if 'cnxml' in request.POST and form.validate():
        cnxml = form.data['cnxml']
        # Keep sure we use the standard python ascii string and encode Unicode to xml character mappings
        # (Python 2: form data may arrive as a unicode object.)
        if isinstance(cnxml, unicode):
            cnxml = cnxml.encode('ascii', 'xmlcharrefreplace')

        # get the list of files from upload.zip if it exists
        # NOTE(review): the result of this whole block is discarded --
        # 'files' is reassigned from get_files_from_zipfile() in the try
        # below, so this loop appears to be dead code; confirm and remove.
        files = []
        zip_filename = os.path.join(save_dir, 'upload.zip')
        if os.path.exists(zip_filename):
            zip_archive = zipfile.ZipFile(zip_filename, 'r')
            for filename in zip_archive.namelist():
                # index.cnxml is replaced by the freshly edited content.
                if filename == 'index.cnxml':
                    continue
                fp = zip_archive.open(filename, 'r')
                files.append((filename, fp.read()))
                fp.close()

        try:
            files = get_files_from_zipfile(os.path.join(save_dir, 'upload.zip'))
            # Save first, then validate: a validation failure must not
            # lose the user's edits.  # NOTE(review): presumed intent.
            save_cnxml(save_dir, cnxml, files)
            validate_cnxml(cnxml)
        except ConversionError as e:
            return render_conversionerror(request, e.msg)

        # Return to preview
        return HTTPFound(location=request.route_url('preview'), request=request)

    # Read CNXML
    try:
        with open(cnxml_filename, 'rt') as fp:
            cnxml = fp.read()
    except IOError:
        raise HTTPNotFound('index.cnxml not found')

    # Clean CNXML
    cnxml = clean_cnxml(cnxml)
    # Decode to a unicode object for the template (Python 2 semantics).
    cnxml = cnxml.decode('utf-8')
    cnxml = unicode(cnxml)

    return {
        'codemirror': True,
        'form': FormRenderer(form),
        'cnxml': cnxml,
        'transformerror': transformerror,
    }
def test_doc(self): doc_files=os.listdir(test_folder_name+'doc/') i=0 # Find only .odt files in the testing folder for odt while(i < len(doc_files)): f=doc_files[i] filename, extension = os.path.splitext(f) if(extension != '.doc'): doc_files.remove(f) else: i=i+1 for f in doc_files: original_filename=test_folder_name+'doc/'+f filename, extension = os.path.splitext(original_filename) valid_filename=filename+'.cnxml' output_filename=filename+'.tmp' doc_filename = original_filename diff_filename = filename+'.diff' err_filename = filename+'.err' odt_filename= filename+'.odt' command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd()+'/'+original_filename + ',' + os.getcwd()+'/'+odt_filename + ')"' os.system(command) try: open(valid_filename, 'r') except IOError as e: print('Missing valid file ('+valid_filename+') for testing '+original_filename) return tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) validate_cnxml(cnxml) output=open(output_filename,'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen(['diff',valid_filename,output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if(std_output[0] != None and len(std_output[0]) != 0): diff_output=open(diff_filename,'w') diff_output.write(std_output[0]) diff_output.close() print('Differences in the testing of '+original_filename+', information on those differences has been placed in '+diff_filename) elif(std_output[1] != None and len(std_output[1]) != 0): err_output=open(err_filename,'w') err_output.write(std_output[1]) err_output.close() print('Error(s) occurred while attempting to test for differences in CNXML output of '+original_filename+', information on these errors are in '+err_filename)
def test_latex(self): latex_files=os.listdir(test_folder_name+'latex/') i=0 while(i < len(latex_files)): f=latex_files[i] filename, extension = os.path.splitext(f) if(extension != ''): latex_files.remove(f) else: i=i+1 for f in latex_files: original_filename=test_folder_name+'latex/'+f filename, extension = os.path.splitext(original_filename) valid_filename=filename+'.cnxml' output_filename=filename+'.tmp' diff_filename = filename+'.diff' err_filename = filename+'.err' fp=open(original_filename, 'r') latex_archive = fp.read() # LaTeX 2 CNXML transformation cnxml, objects = latex_to_cnxml(latex_archive, original_filename) cnxml = clean_cnxml(cnxml) save_cnxml(save_dir, cnxml, objects.items()) validate_cnxml(cnxml) fp.close() output=open(output_filename,'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen(['diff',valid_filename,output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if(std_output[0] != None and len(std_output[0]) != 0): diff_output=open(diff_filename,'w') diff_output.write(std_output[0]) diff_output.close() print('Differences in the testing of '+f+', information on those differences has been placed in '+diff_filename) elif(std_output[1] != None and len(std_output[1]) != 0): err_output=open(err_filename,'w') err_output.write(std_output[1]) err_output.close() print('Error(s) occurred while attempting to test for differences in CNXML output of '+f+', information on these errors are in '+err_filename)
def test_odt(self): odt_files=os.listdir(test_folder_name+'odt/') i=0 # Find only .odt files in the testing folder for odt while(i < len(odt_files)): f=odt_files[i] filename, extension = os.path.splitext(f) if(extension != '.odt'): odt_files.remove(f) else: i=i+1 for f in odt_files: original_filename=test_folder_name+'odt/'+f filename, extension = os.path.splitext(original_filename) valid_filename=filename+'.cnxml' output_filename=filename+'.tmp' odt_filename = original_filename diff_filename = filename+'.diff' err_filename = filename+'.err' try: open(valid_filename, 'r') except IOError as e: print('Missing valid file ('+valid_filename+') for testing '+original_filename) return tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) validate_cnxml(cnxml) output=open(output_filename,'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen(['diff',valid_filename,output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if(std_output[0] != None and len(std_output[0]) != 0): diff_output=open(diff_filename,'w') diff_output.write(std_output[0]) diff_output.close() print('Differences in the testing of '+original_filename+', information on those differences has been placed in '+diff_filename) elif(std_output[1] != None and len(std_output[1]) != 0): err_output=open(err_filename,'w') err_output.write(std_output[1]) err_output.close() print('Error(s) occurred while attempting to test for differences in CNXML output of '+original_filename+', information on these errors are in '+err_filename)
# Command-line entry: regenerate the known-good .cnxml reference file for a
# single test input passed as argv[1] (.odt, .doc or .tex).
filename = sys.argv[1]
name, extension = os.path.splitext(filename)
if (extension == '.odt' or extension == '.doc'):
    if (extension == '.doc'):
        # .doc is first converted to .odt with the external doc2odt tool.
        doc_folder = os.getcwd() + '/' + os.path.dirname(name)
        os.system('./converters/doc2odt -o ' + doc_folder + ' ' + os.getcwd() + '/' + filename)
        # Earlier approach drove a soffice macro directly:
        #command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd()+'/'+filename + ',' + os.getcwd()+'/'+name+'.odt' + ')"'
        #os.system(command)
        # Point the rest of the pipeline at the freshly produced .odt.
        filename = name + '.odt'
    valid_filename = name + '.cnxml'
    # ODT -> CNXML, cleaned, written out as the new reference, ids stripped.
    tree, files, errors = transform(filename)
    cnxml = clean_cnxml(etree.tostring(tree))
    output = open(valid_filename, 'w')
    output.write(cnxml)
    output.close()
    remove_ids(valid_filename)
    if (extension == '.doc'):
        # Drop the temporary .odt produced from the .doc input.
        os.remove(os.getcwd() + '/' + name + '.odt')
elif (extension == '.tex'):
    valid_filename = name + '.cnxml'
    fp = open(filename, 'r')
    latex_archive = fp.read()
    fp.close()
    # LaTeX 2 CNXML transformation
    cnxml, objects = latex_to_cnxml(latex_archive, filename)
    # NOTE(review): this chunk appears truncated -- cnxml is computed for
    # the .tex case but the write-out to valid_filename is not visible
    # here; confirm against the full script.
    def test_gdocs(self):
        """Round-trip test for the Google Docs import pipeline.

        Uploads every .doc in the doc/ test folder to Google Docs,
        optionally adds document ids listed in test_files/gdocs/test_files,
        then downloads each document's HTML export, converts it to CNXML
        and diffs the result against the stored .cnxml reference.

        NOTE(review): depends on network access and Google Docs
        credentials (via upload_doc / get_gdoc); not a hermetic test.
        """
        # Detect the optional list of pre-existing gdocs URLs to test.
        have_test_file = False
        try:
            fp = open('./test_files/gdocs/test_files')
            fp.close()
            have_test_file = True
        except:
            # NOTE(review): bare except also hides non-IO errors.
            print('No gdocs test file')
        doc_files = os.listdir(test_folder_name + 'doc/')
        rids = []  # gdocs resource ids, formatted as 'document:<id>'
        # Filter the listing down to .doc files (in-place, index-driven
        # so removal does not skip entries).
        i = 0
        while (i < len(doc_files)):
            f = doc_files[i]
            filename, extension = os.path.splitext(f)
            if (extension != '.doc'):
                doc_files.remove(f)
            else:
                i = i + 1
        # Upload each local .doc; collect the resource id of each upload.
        for d in doc_files:
            try:
                just_filename = os.path.basename(d)
                just_filename, extension = os.path.splitext(just_filename)
                rid = upload_doc(test_folder_name + 'doc/' + d, 'application/msword', just_filename)
                rids.append(rid)
            except KeyboardInterrupt:
                exit()
            except:
                # NOTE(review): bare except; upload failures are only
                # reported, not recorded.
                print('Error uploading ' + just_filename + ' to gdocs')
        # Add document ids parsed from the optional URL list ('#' lines
        # are comments).
        if (have_test_file):
            fp = open('./test_files/gdocs/test_files')
            for url in fp:
                if (url[0] == '#'):
                    continue
                match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
                if match_doc_id:
                    rids.append('document:' + match_doc_id.group(1))
            fp.close()
        count = 0
        for rid in rids:
            # The first len(doc_files) rids correspond positionally to the
            # uploaded local files; later ones came from the URL list and
            # are named after the bare document id.
            if (count < len(doc_files)):
                filename = os.path.basename(doc_files[count])
                filename, ext = os.path.splitext(filename)
            else:
                filename = rid[9:]  # strip the 'document:' prefix
            valid_filename = './test_files/gdocs/' + filename + '.cnxml'
            output_filename = './test_files/gdocs/' + filename + '.tmp'
            diff_filename = './test_files/gdocs/' + filename + '.diff'
            err_filename = './test_files/gdocs/' + filename + '.err'
            # Download the document's HTML export into the gdocs folder.
            gdoc_url = construct_url(rid[9:])
            rid, original_title = get_gdoc(gdoc_url, './test_files/gdocs')
            html_filename = './test_files/gdocs/' + rid[9:] + '.htm'
            html_file = open(html_filename, 'r')
            try:
                html = html_file.read()
                html_file.flush()  # NOTE(review): flush on a read handle is a no-op
            finally:
                html_file.close()
            # HTML -> CNXML, clean, validate, write the comparable output.
            cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
            cnxml = clean_cnxml(cnxml)
            validate_cnxml(cnxml)
            output = open(output_filename, 'w')
            output.write(cnxml)
            output.close()
            remove_ids(output_filename)
            # The downloaded HTML is temporary.
            os.remove('./test_files/gdocs/' + rid[9:] + '.htm')
            process = subprocess.Popen(['diff', valid_filename, output_filename], shell=False, stdout=subprocess.PIPE)
            std_output = process.communicate()
            if (std_output[0] != None and len(std_output[0]) != 0):
                diff_output = open(diff_filename, 'w')
                diff_output.write(std_output[0])
                diff_output.close()
                print('Differences in the testing of gdoc ' + filename + ', information on those differences has been placed in ' + diff_filename)
            elif (std_output[1] != None and len(std_output[1]) != 0):
                err_output = open(err_filename, 'w')
                err_output.write(std_output[1])
                err_output.close()
                print('Error(s) occurred while attempting to test for differences in CNXML output of gdoc ' + filename + ', information on these errors are in ' + err_filename)
            count = count + 1
def choose_view(request): check_login(request) templatePath = 'templates/choose.pt' form = Form(request, schema=UploadSchema) field_list = [('upload', 'File')] # clear the session if 'transformerror' in request.session: del request.session['transformerror'] if 'title' in request.session: del request.session['title'] # Check for successful form completion if form.validate(): try: # Catch-all exception block # Create a directory to do the conversions now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') # TODO: This has a good chance of being unique, but even so... temp_dir_name = '%s-%s' % (request.session['username'], now_string) save_dir = os.path.join( request.registry.settings['transform_dir'], temp_dir_name ) os.mkdir(save_dir) # Keep the info we need for next uploads. Note that this # might kill the ability to do multiple tabs in parallel, # unless it gets offloaded onto the form again. request.session['upload_dir'] = temp_dir_name if form.data['upload'] is not None: request.session['filename'] = form.data['upload'].filename # Google Docs Conversion # if we have a Google Docs ID and Access token. 
if form.data['gdocs_resource_id']: gdocs_resource_id = form.data['gdocs_resource_id'] gdocs_access_token = form.data['gdocs_access_token'] form.data['gdocs_resource_id'] = None form.data['gdocs_access_token'] = None (request.session['title'], request.session['filename']) = \ process_gdocs_resource(save_dir, \ gdocs_resource_id, \ gdocs_access_token) # HTML URL Import: elif form.data.get('url_text'): url = form.data['url_text'] form.data['url_text'] = None # Build a regex for Google Docs URLs regex = re.compile("^https:\/\/docs\.google\.com\/.*document\/[^\/]\/([^\/]+)\/") r = regex.search(url) # Take special action for Google Docs URLs if r: gdocs_resource_id = r.groups()[0] (request.session['title'], request.session['filename']) = \ process_gdocs_resource(save_dir, "document:" + gdocs_resource_id) else: # download html: #html = urllib2.urlopen(url).read() # Simple urlopen() will fail on mediawiki websites like e.g. Wikipedia! import_opener = urllib2.build_opener() import_opener.addheaders = [('User-agent', 'Mozilla/5.0')] try: import_request = import_opener.open(url) html = import_request.read() # transformation cnxml, objects, html_title = htmlsoup_to_cnxml( html, bDownloadImages=True, base_or_source_url=url) request.session['title'] = html_title cnxml = clean_cnxml(cnxml) save_cnxml(save_dir, cnxml, objects.items()) # Keep the info we need for next uploads. Note that # this might kill the ability to do multiple tabs in # parallel, unless it gets offloaded onto the form # again. request.session['filename'] = "HTML Document" validate_cnxml(cnxml) except urllib2.URLError, e: request['errors'] = ['The URL %s could not be opened' %url,] response = { 'form': FormRenderer(form), } return render_to_response(templatePath, response, request=request) # Office, CNXML-ZIP or LaTeX-ZIP file else: # Save the original file so that we can convert, plus keep it. 
original_filename = os.path.join( save_dir, form.data['upload'].filename.replace(os.sep, '_')) saved_file = open(original_filename, 'wb') input_file = form.data['upload'].file shutil.copyfileobj(input_file, saved_file) saved_file.close() input_file.close() form.data['upload'] = None # Check if it is a ZIP file with at least index.cnxml or a LaTeX file in it try: zip_archive = zipfile.ZipFile(original_filename, 'r') is_zip_archive = ('index.cnxml' in zip_archive.namelist()) # Do we have a latex file? if not is_zip_archive: # incoming latex.zip must contain a latex.tex file, where "latex" is the base name. (latex_head, latex_tail) = os.path.split(original_filename) (latex_root, latex_ext) = os.path.splitext(latex_tail) latex_basename = latex_root latex_filename = latex_basename + '.tex' is_latex_archive = (latex_filename in zip_archive.namelist()) except zipfile.BadZipfile: is_zip_archive = False is_latex_archive = False # ZIP package from previous conversion if is_zip_archive: # Unzip into transform directory zip_archive.extractall(path=save_dir) # Rename ZIP file so that the user can download it again os.rename(original_filename, os.path.join(save_dir, 'upload.zip')) # Read CNXML with open(os.path.join(save_dir, 'index.cnxml'), 'rt') as fp: cnxml = fp.read() # Convert the CNXML to XHTML for preview html = cnxml_to_htmlpreview(cnxml) with open(os.path.join(save_dir, 'index.xhtml'), 'w') as index: index.write(html) cnxml = clean_cnxml(cnxml) validate_cnxml(cnxml) # LaTeX elif is_latex_archive: f = open(original_filename) latex_archive = f.read() # LaTeX 2 CNXML transformation cnxml, objects = latex_to_cnxml(latex_archive, original_filename) cnxml = clean_cnxml(cnxml) save_cnxml(save_dir, cnxml, objects.items()) validate_cnxml(cnxml) # OOo / MS Word Conversion else: # Convert from other office format to odt if needed odt_filename = original_filename filename, extension = os.path.splitext(original_filename) if(extension != '.odt'): odt_filename= '%s.odt' % filename 
command = '/usr/bin/soffice -headless -nologo -nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + escape_system(original_filename)[1:-1] + ',' + odt_filename + ')"' os.system(command) try: fp = open(odt_filename, 'r') fp.close() except IOError as io: raise ConversionError("%s not found" % original_filename) # Convert and save all the resulting files. tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) save_cnxml(save_dir, cnxml, files.items()) # now validate with jing validate_cnxml(cnxml)
# Stand-alone helper script: fetch one Google Doc, convert it to CNXML and
# write the result to valid.cnxml (used to build a test reference file).
from utils import clean_cnxml, escape_system
from test_conversion import validate_cnxml, remove_ids

url = 'https://docs.google.com/document/d/1tiZR1fhBl3ZQ_UaQ5sRDA3gSs_7LjgtTITkBAGjuTpI/edit'
#url='https://docs.google.com/document/d/1Gw9j1J-_d5YQoq6SIc3Az2hiVlwtvVcJkXfYKDR_zBM/edit'
# Extract the document id from the URL; 'document:<id>' is the resource-id
# format used by the gdocs helpers in this project.
match_doc_id = re.match(r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
rid = 'document:' + match_doc_id.group(1)
print(rid)
filename = rid[9:]  # NOTE(review): assigned but never used below
valid_filename = 'valid.cnxml'
gdoc_url = construct_url(rid[9:])  # rid[9:] strips the 'document:' prefix
print(gdoc_url)
# NOTE(review): get_gdoc() is called with the original edit URL, not the
# constructed gdoc_url printed above -- confirm which one is intended.
rid, original_title = get_gdoc(url, './')
html_filename = './' + rid[9:] + '.htm'
html_file = open(html_filename, 'r')
try:
    html = html_file.read()
    html_file.flush()  # NOTE(review): flush on a read-only handle is a no-op
finally:
    html_file.close()
# HTML -> CNXML, cleaned and validated, then saved as the reference file.
cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
cnxml = clean_cnxml(cnxml)
validate_cnxml(cnxml)
output = open(valid_filename, 'w')
output.write(cnxml)
output.close()
# Command-line entry: regenerate the known-good .cnxml reference file for a
# single test input passed as argv[1] (.odt, .doc or .tex).
filename = sys.argv[1]
name, extension = os.path.splitext(filename)
if (extension == '.odt' or extension == '.doc'):
    if (extension == '.doc'):
        # .doc is first converted to .odt with the external doc2odt tool.
        doc_folder = os.getcwd() + '/' + os.path.dirname(name)
        os.system('./converters/doc2odt -o ' + doc_folder + ' ' + os.getcwd() + '/' + filename)
        # Earlier approach drove a soffice macro directly:
        #command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd()+'/'+filename + ',' + os.getcwd()+'/'+name+'.odt' + ')"'
        #os.system(command)
        # Point the rest of the pipeline at the freshly produced .odt.
        filename = name + '.odt'
    valid_filename = name + '.cnxml'
    # ODT -> CNXML, cleaned, written out as the new reference, ids stripped.
    tree, files, errors = transform(filename)
    cnxml = clean_cnxml(etree.tostring(tree))
    output = open(valid_filename, 'w')
    output.write(cnxml)
    output.close()
    remove_ids(valid_filename)
    if (extension == '.doc'):
        # Drop the temporary .odt produced from the .doc input.
        os.remove(os.getcwd() + '/' + name + '.odt')
elif (extension == '.tex'):
    valid_filename = name + '.cnxml'
    fp = open(filename, 'r')
    latex_archive = fp.read()
    fp.close()
    # LaTeX 2 CNXML transformation
    cnxml, objects = latex_to_cnxml(latex_archive, filename)
    # NOTE(review): this chunk appears truncated -- cnxml is computed for
    # the .tex case but the write-out to valid_filename is not visible
    # here; confirm against the full script.
def enhance(request):
    """Pyramid view: build a CNXML module around an imported slideshow.

    Renders the Google/SlideShare preview template; when the
    question/answer form validates, assembles a CNXML document from the
    session's base CNXML plus an introduction, an embedded SlideShare
    iframe and up to five multiple-choice exercises, uploads it to cnx.org
    via SWORD, notifies an external App Engine service, saves the CNXML
    into a fresh transform directory and redirects to the metadata step.

    NOTE(review): reconstructed from whitespace-mangled source -- the
    exact nesting of a few statements (e.g. the slideshare status check)
    should be confirmed against the original file.
    """
    check_login(request)
    session = request.session
    google_resource_id = ""
    slideshare_id = ""
    embed_google = False
    embed_slideshare = False
    not_converted = True
    show_iframe = False
    form = Form(request, schema=QuestionAnswerSchema)
    validate_form = form.validate()
    print form.all_errors()
    if session.has_key('google-resource-id'):
        google_resource_id = session['google-resource-id']
    if session.has_key('slideshare_id'):
        slideshare_id = session['slideshare_id']
        # Status "2" means SlideShare finished converting the slideshow.
        if fetch_slideshow_status(slideshare_id) == "2":
            not_converted = False
            show_iframe = True
    if google_resource_id != "":
        embed_google = True
    if slideshare_id != "":
        embed_slideshare = True
    templatePath = "templates/google_ss_preview.pt"
    if validate_form:
        introductory_paragraphs = request.POST.get('introductory_paragraphs')
        question_count = 0
        # Start the document: introduction section + embedded slideshow.
        cnxml = session["cnxml"] + """<content><section id="intro-section-title"><title id="introtitle">Introduction</title><para id="introduction-1">""" + introductory_paragraphs + """</para></section><section id="slides-embed"><title id="slide-embed-title">View the slides</title><figure id="ss-embed-figure"><media id="slideshare-embed" alt="slideshare-embed"><iframe src="http://www.slideshare.net/slideshow/embed_code/""" + slideshare_id + """" width="425" height="355" /></media></figure></section>"""
        # Up to five quiz questions, each with up to nine answer options.
        for i in range(1, 6):
            form_question = request.POST.get('question-' + str(i))
            if form_question:
                form_radio_answer = request.POST.get('radio-' + str(i))  #this give us something like 'answer-1-1'. so our solution is this
                question_count += 1
                if question_count == 1:
                    # First question opens the quiz section.
                    cnxml += """<section id="test-section"><title>Test your knowledge</title>"""
                itemlist = ""
                for j in range(1, 10):
                    try:
                        form_all_answers = request.POST.get('answer-' + str(i) + '-' + str(j))
                        if form_all_answers:
                            itemlist += "<item>" + form_all_answers + "</item>"
                    except:
                        # NOTE(review): POST.get should not raise; this
                        # bare except looks unreachable.
                        print "No element found"
                if form_radio_answer:
                    # The radio value names the POST field holding the
                    # correct answer.
                    solution = request.POST.get(form_radio_answer)
                    cnxml += """<exercise id="exercise-""" + str(i) + """"><problem id="problem-""" + str(i) + """"><para id="para-""" + str(i) + """">""" + str(form_question) + """<list id="option-list-""" + str(i) + """" list-type="enumerated" number-style="lower-alpha">""" + str(itemlist) + """</list></para></problem>"""
                else:
                    # No radio selected: fall back to the first answer and
                    # emit the problem without an option list.
                    print "ELESE CONDUITION OF radio"
                    solution = request.POST.get('answer-' + str(i) + '-1')
                    cnxml += """<exercise id="exercise-""" + str(i) + """"><problem id="problem-""" + str(i) + """"><para id="para-""" + str(i) + """">""" + str(form_question) + """</para></problem>"""
                print "FORM RADIO ANSWER", form_radio_answer
                print "SOLUTION", solution
                cnxml += """ <solution id="solution-""" + str(i) + """"> <para id="solution-para-""" + str(i) + """">""" + solution + """</para></solution></exercise>"""
                # The string below is leftover commented-out code kept as
                # an expression-statement (no-op) in the original.
                """form_solution = request.POST.get('solution-'+str(i))
                all_post_data = {"data":{"options":form_options,"solution":form_solution,"question":form_question}}
                for question in all_post_data:
                    options = all_post_data[question]['options']
                    solution = all_post_data[question]['solution']
                    asked_question = all_post_data[question]['question']
                    optionlist=""
                    for option in options:
                        optionlist+="<item>"+option+"</item>"""
                #cnxml+="""<exercise id="exercise-"""+str(j)+""""><problem id="problem-"""+str(j)+""""><para id="para-"""+str(j)+"""">"""+str(asked_question)+"""<list id="option-list-"""+str(j)+"""" list-type="enumerated" number-style="lower-alpha">"""+str(optionlist)+"""</list></para></problem>"""
                #cnxml+=""" <solution id="solution-"""+str(j)+""""> <para id="solution-para-"""+str(j)+"""">"""+solution+"""</para></solution></exercise>"""
                #j+=1
        metadata = session['metadata']
        # Close the quiz section only if at least one question was added.
        if question_count >= 1:
            cnxml += "</section></content></document>"
        else:
            cnxml += "</content></document>"
        workspaces = [(i['href'], i['title']) for i in session['login'].collections]
        metadata_entry = sword2cnx.MetaData(metadata)
        # Package the CNXML as upload.zip for the SWORD deposit.
        zipped_filepath = session['userfilepath']
        zip_archive = zipfile.ZipFile(zipped_filepath, 'w')
        zip_archive.writestr("index.cnxml", cnxml)
        zip_archive.close()
        conn = sword2cnx.Connection("http://cnx.org/sword/servicedocument",
                                    user_name=session['login'].username,
                                    user_pass=session['login'].password,
                                    always_authenticate=True,
                                    download_service_document=True)
        # Refresh the user's workspace list from the service document.
        collections = [{'title': i.title, 'href': i.href} for i in sword2cnx.get_workspaces(conn)]
        session['login'].collections = collections
        workspaces = [(i['href'], i['title']) for i in session['login'].collections]
        session['workspaces'] = workspaces
        # Deposit into the first workspace as an in-progress SimpleZip.
        with open(zipped_filepath, 'rb') as zip_file:
            deposit_receipt = conn.create(col_iri=workspaces[0][0],
                                          metadata_entry=metadata_entry,
                                          payload=zip_file,
                                          filename='upload.zip',
                                          mimetype='application/zip',
                                          packaging='http://purl.org/net/sword/package/SimpleZip',
                                          in_progress=True)
        session['dr'] = deposit_receipt
        session['deposit_receipt'] = deposit_receipt.to_xml()
        # Pull the edit IRI and the creator's email out of the receipt.
        soup = BeautifulSoup(deposit_receipt.to_xml())
        data = soup.find("link", rel="edit")
        edit_iri = data['href']
        session['edit_iri'] = edit_iri
        creator = soup.find('dcterms:creator')
        username = session['login'].username
        email = creator["oerdc:email"]
        # Notify the external App Engine tracker of this slideshow import.
        # NOTE(review): blocking network call; failures are unhandled.
        url = "http://connexions-oerpub.appspot.com/"
        post_values = {"username": username, "email": email, "slideshow_id": slideshare_id}
        data = urllib.urlencode(post_values)
        google_req = urllib2.Request(url, data)
        google_response = urllib2.urlopen(google_req)
        # Create a fresh transform directory and save the CNXML there for
        # the next (metadata) step.
        now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        temp_dir_name = '%s-%s' % (request.session['login'].username, now_string)
        save_dir = os.path.join(request.registry.settings['transform_dir'], temp_dir_name)
        os.mkdir(save_dir)
        request.session['upload_dir'] = temp_dir_name
        cnxml = clean_cnxml(cnxml)
        save_cnxml(save_dir, cnxml, [])
        return HTTPFound(location=request.route_url('metadata'))
        #return HTTPFound(location=request.route_url('updatecnx'))
    # Form not (yet) valid: render the preview template.
    response = {'form': FormRenderer(form),
                "slideshare_id": slideshare_id,
                "google_resource_id": google_resource_id,
                "embed_google": embed_google,
                "embed_slideshare": embed_slideshare,
                "not_converted": not_converted,
                "show_iframe": show_iframe}
    return render_to_response(templatePath, response, request=request)
def test_doc(self): doc_files = os.listdir(test_folder_name + 'doc/') i = 0 # Find only .odt files in the testing folder for odt while (i < len(doc_files)): f = doc_files[i] filename, extension = os.path.splitext(f) if (extension != '.doc'): doc_files.remove(f) else: i = i + 1 for f in doc_files: original_filename = test_folder_name + 'doc/' + f filename, extension = os.path.splitext(original_filename) valid_filename = filename + '.cnxml' output_filename = filename + '.tmp' doc_filename = original_filename diff_filename = filename + '.diff' err_filename = filename + '.err' odt_filename = filename + '.odt' command = '/usr/bin/soffice --headless --nologo --nofirststartwizard "macro:///Standard.Module1.SaveAsOOO(' + os.getcwd( ) + '/' + original_filename + ',' + os.getcwd( ) + '/' + odt_filename + ')"' os.system(command) try: open(valid_filename, 'r') except IOError as e: print('Missing valid file (' + valid_filename + ') for testing ' + original_filename) return tree, files, errors = transform(odt_filename) cnxml = clean_cnxml(etree.tostring(tree)) validate_cnxml(cnxml) output = open(output_filename, 'w') output.write(cnxml) output.close() remove_ids(output_filename) process = subprocess.Popen( ['diff', valid_filename, output_filename], shell=False, stdout=subprocess.PIPE) std_output = process.communicate() if (std_output[0] != None and len(std_output[0]) != 0): diff_output = open(diff_filename, 'w') diff_output.write(std_output[0]) diff_output.close() print( 'Differences in the testing of ' + original_filename + ', information on those differences has been placed in ' + diff_filename) elif (std_output[1] != None and len(std_output[1]) != 0): err_output = open(err_filename, 'w') err_output.write(std_output[1]) err_output.close() print( 'Error(s) occurred while attempting to test for differences in CNXML output of ' + original_filename + ', information on these errors are in ' + err_filename)
def enhance(request):
    """Pyramid view: build a CNXML module from slideshow Q&A form input.

    On a valid POST, appends an intro section, a SlideShare embed, and up
    to five multiple-choice exercises to the session's CNXML, deposits the
    zipped module via SWORD to cnx.org, notifies an external appspot
    service, saves the cleaned CNXML locally, and redirects to the
    metadata step. Otherwise re-renders the preview form.
    """
    check_login(request)
    session = request.session
    google_resource_id = ""
    slideshare_id = ""
    embed_google = False
    embed_slideshare = False
    not_converted = True
    show_iframe = False
    form = Form(request, schema=QuestionAnswerSchema)
    validate_form = form.validate()
    print form.all_errors()
    if session.has_key('google-resource-id'):
        google_resource_id = session['google-resource-id']
    if session.has_key('slideshare_id'):
        slideshare_id = session['slideshare_id']
        # Status "2" appears to mean the slideshow finished converting —
        # TODO confirm against fetch_slideshow_status's contract.
        if fetch_slideshow_status(slideshare_id) == "2":
            not_converted = False
            show_iframe = True
    if google_resource_id != "":
        embed_google = True
    if slideshare_id != "":
        embed_slideshare = True
    templatePath = "templates/google_ss_preview.pt"
    if validate_form:
        introductory_paragraphs = request.POST.get('introductory_paragraphs')
        question_count = 0
        # Start from the session's CNXML and append intro + slide embed.
        cnxml = session[
            "cnxml"] + """<content><section id="intro-section-title"><title id="introtitle">Introduction</title><para id="introduction-1">""" + introductory_paragraphs + """</para></section><section id="slides-embed"><title id="slide-embed-title">View the slides</title><figure id="ss-embed-figure"><media id="slideshare-embed" alt="slideshare-embed"><iframe src="http://www.slideshare.net/slideshow/embed_code/""" + slideshare_id + """" width="425" height="355" /></media></figure></section>"""
        # Up to five questions come in as question-1 .. question-5.
        for i in range(1, 6):
            form_question = request.POST.get('question-' + str(i))
            if form_question:
                # The radio field holds the NAME of the correct answer's
                # field, e.g. 'answer-1-1'; it is dereferenced below.
                form_radio_answer = request.POST.get('radio-' + str(i))
                question_count += 1
                if question_count == 1:
                    # Open the quiz section before the first exercise.
                    cnxml += """<section id="test-section"><title>Test your knowledge</title>"""
                itemlist = ""
                # Collect up to nine answer options for this question.
                for j in range(1, 10):
                    try:
                        form_all_answers = request.POST.get('answer-' + str(i)
                                                            + '-' + str(j))
                        if form_all_answers:
                            itemlist += "<item>" + form_all_answers + "</item>"
                    except:
                        print "No element found"
                if form_radio_answer:
                    # Correct answer text is the value of the field whose
                    # name was carried in the radio input.
                    solution = request.POST.get(form_radio_answer)
                    cnxml += """<exercise id="exercise-""" + str(
                        i) + """"><problem id="problem-""" + str(
                        i) + """"><para id="para-""" + str(i) + """">""" + str(
                        form_question) + """<list id="option-list-""" + str(
                        i) + """" list-type="enumerated" number-style="lower-alpha">""" + str(
                        itemlist) + """</list></para></problem>"""
                else:
                    # No radio selection: default to the first answer.
                    print "ELESE CONDUITION OF radio"
                    solution = request.POST.get('answer-' + str(i) + '-1')
                    cnxml += """<exercise id="exercise-""" + str(
                        i) + """"><problem id="problem-""" + str(
                        i) + """"><para id="para-""" + str(i) + """">""" + str(
                        form_question) + """</para></problem>"""
                print "FORM RADIO ANSWER", form_radio_answer
                print "SOLUTION", solution
                cnxml += """ <solution id="solution-""" + str(
                    i) + """"> <para id="solution-para-""" + str(
                    i) + """">""" + solution + """</para></solution></exercise>"""
                # Dead code below: an abandoned draft kept as a string
                # literal plus commented-out lines.
                """form_solution = request.POST.get('solution-'+str(i))
                all_post_data = {"data":{"options":form_options,"solution":form_solution,"question":form_question}}
                for question in all_post_data:
                    options = all_post_data[question]['options']
                    solution = all_post_data[question]['solution']
                    asked_question = all_post_data[question]['question']
                    optionlist=""
                    for option in options:
                        optionlist+="<item>"+option+"</item>"""
                #cnxml+="""<exercise id="exercise-"""+str(j)+""""><problem id="problem-"""+str(j)+""""><para id="para-"""+str(j)+"""">"""+str(asked_question)+"""<list id="option-list-"""+str(j)+"""" list-type="enumerated" number-style="lower-alpha">"""+str(optionlist)+"""</list></para></problem>"""
                #cnxml+=""" <solution id="solution-"""+str(j)+""""> <para id="solution-para-"""+str(j)+"""">"""+solution+"""</para></solution></exercise>"""
                #j+=1
        metadata = session['metadata']
        # Close the quiz section only if at least one exercise was added.
        if question_count >= 1:
            cnxml += "</section></content></document>"
        else:
            cnxml += "</content></document>"
        workspaces = [(i['href'], i['title'])
                      for i in session['login'].collections]
        metadata_entry = sword2cnx.MetaData(metadata)
        # Zip the finished CNXML for the SWORD deposit payload.
        zipped_filepath = session['userfilepath']
        zip_archive = zipfile.ZipFile(zipped_filepath, 'w')
        zip_archive.writestr("index.cnxml", cnxml)
        zip_archive.close()
        conn = sword2cnx.Connection("http://cnx.org/sword/servicedocument",
                                    user_name=session['login'].username,
                                    user_pass=session['login'].password,
                                    always_authenticate=True,
                                    download_service_document=True)
        # Refresh the user's workspace list from the service document.
        collections = [{
            'title': i.title,
            'href': i.href
        } for i in sword2cnx.get_workspaces(conn)]
        session['login'].collections = collections
        workspaces = [(i['href'], i['title'])
                      for i in session['login'].collections]
        session['workspaces'] = workspaces
        # Deposit into the first workspace; in_progress=True leaves the
        # module unpublished.
        with open(zipped_filepath, 'rb') as zip_file:
            deposit_receipt = conn.create(
                col_iri=workspaces[0][0],
                metadata_entry=metadata_entry,
                payload=zip_file,
                filename='upload.zip',
                mimetype='application/zip',
                packaging='http://purl.org/net/sword/package/SimpleZip',
                in_progress=True)
        session['dr'] = deposit_receipt
        session['deposit_receipt'] = deposit_receipt.to_xml()
        # Pull the edit IRI and creator e-mail out of the receipt XML.
        soup = BeautifulSoup(deposit_receipt.to_xml())
        data = soup.find("link", rel="edit")
        edit_iri = data['href']
        session['edit_iri'] = edit_iri
        creator = soup.find('dcterms:creator')
        username = session['login'].username
        email = creator["oerdc:email"]
        # Notify the external appspot service about this slideshow.
        # NOTE(review): the response is never checked.
        url = "http://connexions-oerpub.appspot.com/"
        post_values = {
            "username": username,
            "email": email,
            "slideshow_id": slideshare_id
        }
        data = urllib.urlencode(post_values)
        google_req = urllib2.Request(url, data)
        google_response = urllib2.urlopen(google_req)
        # Save the cleaned CNXML into a fresh per-user transform dir.
        now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        temp_dir_name = '%s-%s' % (request.session['login'].username,
                                   now_string)
        save_dir = os.path.join(request.registry.settings['transform_dir'],
                                temp_dir_name)
        os.mkdir(save_dir)
        request.session['upload_dir'] = temp_dir_name
        cnxml = clean_cnxml(cnxml)
        save_cnxml(save_dir, cnxml, [])
        return HTTPFound(location=request.route_url('metadata'))
        #return HTTPFound(location=request.route_url('updatecnx'))
    # Invalid/initial GET: re-render the preview template.
    response = {
        'form': FormRenderer(form),
        "slideshare_id": slideshare_id,
        "google_resource_id": google_resource_id,
        "embed_google": embed_google,
        "embed_slideshare": embed_slideshare,
        "not_converted": not_converted,
        "show_iframe": show_iframe
    }
    return render_to_response(templatePath, response, request=request)
def test_gdocs(self):
    """Regression-test the Google Docs -> CNXML transform.

    Uploads the local .doc fixtures to Google Docs, adds any document ids
    listed in ./test_files/gdocs/test_files, then for each resource id
    downloads the HTML, transforms it to CNXML, and diffs the result
    against the stored-known-good .cnxml file.
    """
    # Optional extra fixtures: a file of Google Docs URLs (may be absent).
    have_test_file = False
    try:
        fp = open('./test_files/gdocs/test_files')
        fp.close()
        have_test_file = True
    except:
        print('No gdocs test file')
    doc_files = os.listdir(test_folder_name + 'doc/')
    rids = []
    i = 0
    # Keep only the .doc files; index advances only when nothing removed.
    while (i < len(doc_files)):
        f = doc_files[i]
        filename, extension = os.path.splitext(f)
        if (extension != '.doc'):
            doc_files.remove(f)
        else:
            i = i + 1
    # Upload each .doc fixture and record the returned resource id.
    for d in doc_files:
        try:
            just_filename = os.path.basename(d)
            just_filename, extension = os.path.splitext(just_filename)
            rid = upload_doc(test_folder_name + 'doc/' + d,
                             'application/msword', just_filename)
            rids.append(rid)
        except KeyboardInterrupt:
            exit()
        except:
            print('Error uploading ' + just_filename + ' to gdocs')
    # Harvest additional resource ids from the URL list, skipping
    # '#'-comment lines.
    if (have_test_file):
        fp = open('./test_files/gdocs/test_files')
        for url in fp:
            if (url[0] == '#'):
                continue
            match_doc_id = re.match(
                r'^.*docs\.google\.com/document/d/([^/]+).*$', url)
            if match_doc_id:
                rids.append('document:' + match_doc_id.group(1))
        fp.close()
    count = 0
    for rid in rids:
        # Ids from uploaded .doc files reuse the local filename; harvested
        # ids fall back to the id itself. rid[9:] strips the 'document:'
        # prefix.
        if (count < len(doc_files)):
            filename = os.path.basename(doc_files[count])
            filename, ext = os.path.splitext(filename)
        else:
            filename = rid[9:]
        valid_filename = './test_files/gdocs/' + filename + '.cnxml'
        output_filename = './test_files/gdocs/' + filename + '.tmp'
        diff_filename = './test_files/gdocs/' + filename + '.diff'
        err_filename = './test_files/gdocs/' + filename + '.err'
        gdoc_url = construct_url(rid[9:])
        # NOTE: rid is rebound here to whatever get_gdoc returns.
        rid, original_title = get_gdoc(gdoc_url, './test_files/gdocs')
        html_filename = './test_files/gdocs/' + rid[9:] + '.htm'
        html_file = open(html_filename, 'r')
        try:
            html = html_file.read()
            html_file.flush()
        finally:
            html_file.close()
        # HTML -> CNXML transformation, then validate and strip ids so the
        # diff ignores auto-generated identifiers.
        cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
        cnxml = clean_cnxml(cnxml)
        validate_cnxml(cnxml)
        output = open(output_filename, 'w')
        output.write(cnxml)
        output.close()
        remove_ids(output_filename)
        os.remove('./test_files/gdocs/' + rid[9:] + '.htm')
        process = subprocess.Popen(
            ['diff', valid_filename, output_filename],
            shell=False,
            stdout=subprocess.PIPE)
        std_output = process.communicate()
        if (std_output[0] != None and len(std_output[0]) != 0):
            diff_output = open(diff_filename, 'w')
            diff_output.write(std_output[0])
            diff_output.close()
            print(
                'Differences in the testing of gdoc ' + filename +
                ', information on those differences has been placed in ' +
                diff_filename)
        # NOTE(review): stderr is not piped above, so std_output[1] is
        # always None and this branch is dead as written.
        elif (std_output[1] != None and len(std_output[1]) != 0):
            err_output = open(err_filename, 'w')
            err_output.write(std_output[1])
            err_output.close()
            print(
                'Error(s) occurred while attempting to test for differences in CNXML output of gdoc '
                + filename + ', information on these errors are in ' +
                err_filename)
        count = count + 1
def test_url(self):
    """Regression-test the URL/HTML -> CNXML transform.

    Each extensionless file under <test_folder_name>/url/ contains a URL
    on its first line. The page is fetched, transformed to CNXML, and
    diffed against the stored-known-good <name>.cnxml; differences go to
    <name>.diff, diff's own errors to <name>.err.
    """
    url_files = os.listdir(test_folder_name + 'url/')
    i = 0
    # Keep only the extensionless URL-list files.
    while i < len(url_files):
        f = url_files[i]
        filename, extension = os.path.splitext(f)
        if extension != '':
            url_files.remove(f)
        else:
            i = i + 1
    for f in url_files:
        input_file = open(test_folder_name + 'url/' + f, 'r')
        url = input_file.readline()
        input_file.close()
        # BUGFIX: removed the dead first assignment of output_filename to
        # the '.cnxml' path (it was immediately overwritten with '.tmp'
        # and duplicated valid_filename).
        valid_filename = test_folder_name + 'url/' + f + '.cnxml'
        output_filename = test_folder_name + 'url/' + f + '.tmp'
        diff_filename = test_folder_name + 'url/' + f + '.diff'
        err_filename = test_folder_name + 'url/' + f + '.err'
        import_opener = urllib2.build_opener()
        import_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            import_request = import_opener.open(url)
            html = import_request.read()
            # transformation
            cnxml, objects, html_title = htmlsoup_to_cnxml(
                html, bDownloadImages=True, base_or_source_url=url)
            cnxml = clean_cnxml(cnxml)
            validate_cnxml(cnxml)
            output = open(output_filename, 'w')
            output.write(cnxml)
            output.close()
            remove_ids(output_filename)
            # BUGFIX: stderr is now piped too; previously only stdout was
            # captured, so std_output[1] was always None and the
            # error-report branch below could never run.
            process = subprocess.Popen(
                ['diff', valid_filename, output_filename],
                shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            std_output = process.communicate()
            if std_output[0] is not None and len(std_output[0]) != 0:
                diff_output = open(diff_filename, 'w')
                diff_output.write(std_output[0])
                diff_output.close()
                print(
                    'Differences in the testing of ' + f +
                    ', information on those differences has been placed in ' +
                    diff_filename)
            elif std_output[1] is not None and len(std_output[1]) != 0:
                err_output = open(err_filename, 'w')
                err_output.write(std_output[1])
                err_output.close()
                print(
                    'Error(s) occurred while attempting to test for differences in CNXML output of '
                    + f + ', information on these errors are in ' +
                    err_filename)
        # `as` syntax works on both Python 2.6+ and 3; the bound exception
        # was unused, so the name is dropped.
        except urllib2.URLError:
            print('URL ' + url + ' could not be opened')
            quit()
# Regenerate the known-good .cnxml files for the gdocs tests: for every
# collected resource id, download the Google Doc HTML, transform it to
# CNXML, and OVERWRITE the stored valid file with the (id-stripped)
# result. NOTE(review): relies on `rids`, `count`, and `doc_files` being
# defined earlier — presumably by the same harvesting code test_gdocs
# uses; confirm against the surrounding scope.
for rid in rids:
    print(rid)
    # Ids from uploaded .doc files reuse the local filename; others fall
    # back to the id itself (rid[9:] strips the 'document:' prefix).
    if count < len(doc_files):
        filename = os.path.basename(doc_files[count])
        filename, ext = os.path.splitext(filename)
    else:
        filename = rid[9:]
    valid_filename = "./test_files/gdocs/" + filename + ".cnxml"
    gdoc_url = construct_url(rid[9:])
    # NOTE: rid is rebound here to whatever get_gdoc returns.
    rid, original_title = get_gdoc(gdoc_url, "./test_files/gdocs")
    html_filename = "./test_files/gdocs/" + rid[9:] + ".htm"
    html_file = open(html_filename, "r")
    try:
        html = html_file.read()
        html_file.flush()
    finally:
        html_file.close()
    # HTML -> CNXML, then strip auto-generated ids so future diffs are
    # stable.
    cnxml, objects = gdocs_to_cnxml(html, bDownloadImages=True)
    cnxml = clean_cnxml(cnxml)
    validate_cnxml(cnxml)
    output = open(valid_filename, "w")
    output.write(cnxml)
    output.close()
    remove_ids(valid_filename)
    count = count + 1
    os.remove("./test_files/gdocs/" + rid[9:] + ".htm")