Ejemplo n.º 1
0
 def docx_to_html(self, file, method=None):
     """Convert a .docx document to an HTML string with mammoth.

     Paths that start with a backslash are opened through ``smbclient``
     using the module-level ``smb_username`` / ``smb_password``. Any other
     value is appended to the module-level ``document_location`` and handed
     to mammoth as a path.

     :param file: backslash-prefixed share path, or a name relative to
         ``document_location``.
     :param method: not used by this method; kept so existing callers keep
         working.
     :return: the HTML string produced by mammoth.
     :raises Exception: whatever the second SMB attempt or the local
         conversion raises.
     """
     print('entering docx to html')
     if not file.startswith('\\'):
         print('local')
         file = document_location + file
         return mammoth.convert_to_html(file).value

     def _convert_from_share():
         # One attempt at opening the remote file and converting it.
         with smbclient.open_file(file,
                                  mode='rb',
                                  username=smb_username,
                                  password=smb_password) as f:
             converted = mammoth.convert_to_html(f).value
             print('file found')
         return converted

     print('connecting to SMB share')
     try:
         try:
             return _convert_from_share()
         except Exception:
             # Retry exactly once after dropping cached connections.
             # ``Exception`` (not a bare except) so Ctrl-C and SystemExit
             # are not turned into a retry.
             smbclient.reset_connection_cache()
             return _convert_from_share()
     finally:
         # Always leave the connection cache clean, success or failure.
         smbclient.reset_connection_cache()
def table_extraction(Path):
    """Pull the text of every table cell out of a .docx document.

    The document is converted to HTML with mammoth and parsed with
    BeautifulSoup. Bold runs are kept as the literal text ``&lt;b&gt;`` /
    ``&lt;/b&gt;``, ``<br>`` tags become newlines, and zero-width and
    non-breaking spaces are removed.

    :param Path: passed unchanged to ``mammoth.convert_to_html``.
    :return: ``(tables, soup)``. ``tables`` is a list of tables, each a list
        of rows, each a list of cell strings; cells without text, rows
        without cells and tables without rows are left out. ``soup`` is the
        parsed HTML of the whole document.
    """
    soup = BeautifulSoup(mammoth.convert_to_html(Path).value, "html.parser")

    def _cell_text(td):
        # Swap the markup we want to keep for plain placeholders so it
        # survives the tag stripping done by the second parse.
        markup = str(td)
        for old, new in (('<strong>', 'start_bold'),
                         ('</strong>', 'end_bold'),
                         ("<br>", '\n'),
                         ("<br/>", '\n')):
            markup = markup.replace(old, new)
        plain = BeautifulSoup(markup, 'html.parser').text
        for old, new in (('start_bold', '&lt;b&gt;'),
                         ('end_bold', '&lt;/b&gt;'),
                         ('\u200b', ''),
                         ('\xa0', '')):
            plain = plain.replace(old, new)
        return plain.strip()

    tables = []
    for table_tag in soup.find_all('table'):
        rows = []
        for row_tag in table_tag.find_all('tr'):
            cells = [
                _cell_text(td) for td in row_tag.find_all('td') if td.text
            ]
            if cells:
                rows.append(cells)
        if rows:
            tables.append(rows)

    return tables, soup
Ejemplo n.º 3
0
def processDoc(documentPath, htmlDirectory):
    """Convert one .docx file to HTML, scrub it, and file it as good or bad.

    The HTML is passed through ``stripInfo`` with a fixed list of regular
    expressions, then written to ``<htmlDirectory>goodFiles/`` when it
    contains the text "LC Class" and to ``<htmlDirectory>badFiles/``
    otherwise. Any failure is reported on stdout and the function returns
    normally.

    :param documentPath: path of the .docx file to convert.
    :param htmlDirectory: output root; it is concatenated directly with
        "goodFiles/" / "badFiles/", so it must end with a path separator.
    """
    # Drops the last five characters of the file name (the ".docx" suffix).
    basename = os.path.basename(documentPath)[0:-5].replace(" ", "_")
    goodDirectory = htmlDirectory + "goodFiles/"
    badDirectory = htmlDirectory + "badFiles/"

    try:
        with open(documentPath, "rb") as docx_file:
            html = mammoth.convert_to_html(docx_file).value

        soup = BeautifulSoup(html, 'html.parser')

        regExpList = [
            r'Coffman', r'149 Atlantic', r'Swampscott', r'\$\d*\.\d\d'
        ]  # strip dollar amounts, etc.
        stripInfo(soup, regExpList)

        html = str(soup)
        # determine if file is annotated and place accordingly
        targetDirectory = goodDirectory if "LC Class" in html else badDirectory
        htmlPath = targetDirectory + basename + '.html'

        # Explicit UTF-8: converted documents routinely contain characters
        # the platform default encoding cannot represent.
        with open(htmlPath, 'w', encoding='utf-8') as fp:
            fp.write(html)
    except Exception:
        # Best-effort batch step: report and move on, but let
        # KeyboardInterrupt / SystemExit through.
        print("html conversion error: " + documentPath)
Ejemplo n.º 4
0
def converter(in_file, out_file):
    """Convert a .docx file to the format implied by ``out_file``.

    The target format is taken from the extension reported by
    ``get_extension``. HTML output goes through mammoth; the other accepted
    formats go through ``convertapi``. ``True`` or ``False`` is printed to
    report the outcome, followed by a reason on failure. Nothing is
    returned.

    :param in_file: path of the source document; must have a docx extension.
    :param out_file: path of the file to create.
    """
    in_file = os.path.abspath(in_file)
    out_file = os.path.abspath(out_file)

    source_ext = get_extension(in_file)
    target_ext = get_extension(out_file)

    if source_ext != "docx":
        print(False)
        print("Invalid input extension")
        return

    if target_ext in ("jpg", "pdf", "pdfa", "png", "tiff", "txt", "zip"):
        converted = convertapi.convert(target_ext, {'File': in_file},
                                       from_format=source_ext)
        converted.file.save(out_file)
        print(True)
    elif target_ext == "html":
        with open(in_file, "rb") as docx_file:
            markup = mammoth.convert_to_html(docx_file).value
        with open(out_file, "w+") as html_file:
            html_file.write(markup)
        print(True)
    else:
        print(False)
        print("Invalid output extension")
def convert(docfile, htmlfile):
    """Convert the .docx at ``docfile`` to UTF-8 encoded HTML at ``htmlfile``.

    :param docfile: path of the source .docx file.
    :param htmlfile: path of the HTML file to write.
    """
    # Convert first so a failed conversion does not leave a truncated or
    # empty output file behind; ``with`` closes both handles on any exit.
    with open(docfile, 'rb') as f:
        document = mammoth.convert_to_html(f)
    with open(htmlfile, 'wb') as b:
        b.write(document.value.encode('utf8'))
Ejemplo n.º 6
0
    def parseTables(self, table, index, soup):
        """Append one record per data column of ``table`` to ``self.data``.

        Columns 0 and 2 are skipped. For every other column the stripped cell
        texts are paired, top to bottom, with the keys "Field1" .. "Field13"
        and the resulting dict is appended to ``self.data``.

        :param table: object exposing ``columns`` (each with ``cells`` that
            have ``text``) and ``cell(row, col)``.
        :param index: printed as a progress indicator.
        :param soup: not used by this method.
        """
        keys = tuple("Field%d" % n for n in range(1, 14))
        subKeys = tuple("Sub-Field-%d" % n for n in range(1, 22))

        for i, column in enumerate(table.columns):
            if i in (0, 2):
                continue
            # The former ``mammoth.convert_to_html(cell)`` generator was
            # removed: it was never consumed, so it never ran.
            text = (cell.text.strip() for cell in column.cells)
            self.data.append(dict(zip(keys, text)))

        print(index)
        # Rows 13..33 of column 2 hold the sub-field values.
        sub = [table.cell(13 + x, 2).text for x in range(21)]
        # NOTE(review): sub_data is built but not used in the code visible
        # here; kept so the cell reads still happen exactly as before.
        sub_data = dict(zip(subKeys, sub))
Ejemplo n.º 7
0
 def docx_to_html(cls, filepath):
     """Convert the .docx at ``filepath`` and print the HTML and the messages.

     :param filepath: path of the .docx file to read.
     """
     with open(filepath, "rb") as source:
         outcome = mammoth.convert_to_html(source)
     # The generated HTML first, then any messages (such as warnings)
     # mammoth reported during conversion.
     print(outcome.value)
     print(outcome.messages)
Ejemplo n.º 8
0
    def doctorPython(self):
        """Parse every table of ``self.doc`` and dump ``self.data`` to data.json.

        Disables ``self.b1``, then, unless no document has been chosen
        (``self.doc`` still holds the placeholder 'hello') or a run is
        already in progress (``self.butt``), feeds each table of the document
        to ``self.parseTables``, writes ``self.data`` to ``data.json`` and
        opens Explorer with that file selected.
        """
        print('here')
        self.b1.setEnabled(False)

        if self.doc == 'hello' or self.butt:
            return

        self.butt = True
        try:
            document = Document(self.doc)
            with open(self.doc, "rb") as docx_file:
                html = mammoth.convert_to_html(docx_file).value
            soup = BeautifulSoup(html, "lxml")
            print('hello hello')
            for x, table in enumerate(document.tables):
                print('doctor')
                self.parseTables(table, x, soup)
            with open('data.json', 'w') as outfile:
                json.dump(self.data, outfile)
            # Launch Explorer only after the file exists, so there is
            # something to select.
            subprocess.Popen(r'explorer /select,"data.json"')
        finally:
            # Always release the busy flag; otherwise one failed run would
            # block every later one.
            self.butt = False
Ejemplo n.º 9
0
def explicit_style_map_is_combined_with_embedded_style_map():
    # Converting "embedded-style-map.docx" with the extra mapping
    # "r => strong" must give a heading whose run is wrapped in <strong>,
    # with no messages.
    expected_html = "<h1><strong>Walking on imported air</strong></h1>"
    with open(test_path("embedded-style-map.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx,
                                         style_map="r => strong")
    assert_equal(expected_html, result.value)
    assert_equal([], result.messages)
Ejemplo n.º 10
0
    def read_meta_at_path(self, filepath):
        """Read concept name/value pairs from the tables of a .docx file.

        The document is converted to HTML with the options from
        ``self.get_mammoth_options()``; conversion messages go to
        ``self.debug``. Every ``<p>`` in the first cell of a table row is
        treated as a concept name, and the sibling cells of that row as its
        value.

        :param filepath: path of the .docx file to read.
        :return: dict mapping ``self.get_concept_key(name)`` to the parsed
            value, with wrapped footnotes appended when there are any.
        """
        with open(filepath, "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file,
                                             **self.get_mammoth_options())

        if result.messages:
            self.debug('Messages while reading ' + filepath)
            for message in result.messages:
                self.debug(str(message))

        d = pq('<body>' + result.value + '</body>')
        self.clean_html(d)

        meta = {}
        for concept_column_left in d.find('table > tr > td:first-child > p'):
            concept_name = self.parse_concept_name(concept_column_left, d)
            if not self.is_concept_name_valid(concept_name):
                continue

            concept_column_right = d(concept_column_left).parent().siblings('td')
            # Footnotes are parsed before the value is validated, exactly as
            # the helpers have always been called.
            footnotes = self.parse_footnotes(concept_column_right, d)
            if not self.is_concept_value_valid(concept_column_right, d):
                continue

            concept_value = self.parse_concept_value(concept_column_right, d)
            if len(footnotes) > 0:
                concept_value += self.wrap_footnotes(footnotes)
            meta[self.get_concept_key(concept_name)] = concept_value

        return meta
Ejemplo n.º 11
0
def docx2html(infile):
    """Convert the .docx at ``infile`` and return pretty-printed HTML.

    Every message mammoth reports is printed as "Mammoth <type>: <message>".

    :param infile: path of the .docx file; also forwarded to
        ``prettyprint_html``.
    :return: whatever ``prettyprint_html`` returns for the generated HTML.
    """
    with open(infile, 'rb') as docx:
        conversion = mammoth.convert_to_html(docx)

    for note in conversion.messages:
        print("Mammoth %s: %s" % (note.type, note.message))

    return prettyprint_html(conversion.value, infile)
Ejemplo n.º 12
0
def convert_text(filename):
    """Convert the post/page content using the converters.

    The converter is chosen by looking for a known extension anywhere in
    ``filename``, checked in this order: .md, .docx, .tile, .jade, .rst,
    .html, .txt. HTML and plain-text files are returned as they are.

    :param filename: path of the content file. For .docx the document is
        additionally read from ``<cwd>/content/<filename>``.
    :return: the converted markup followed by two newlines, or
        ``"NULL\\n\\n"`` when the file type is not recognised.
    """
    # The handle used to be left open for the garbage collector to close;
    # ``with`` closes it as soon as the content has been read.
    with open(filename, "r") as text_content:
        if ".md" in filename:
            text_cont1 = "\n" + markdown.markdown(text_content.read()) + "\n"
        elif ".docx" in filename:
            with open(os.path.join(cwd, "content", filename),
                      "rb") as docx_file:
                result = mammoth.convert_to_html(docx_file)
                final_docx_html = result.value
            text_cont1 = "\n" + final_docx_html + "\n"
        elif ".tile" in filename:
            text_cont1 = "\n" + textile.textile(text_content.read()) + "\n"
        elif ".jade" in filename:
            text_cont1 = "\n" + pyjade.simple_convert(
                text_content.read()) + "\n"
        elif ".rst" in filename:
            text_cont1 = "\n" + \
                publish_parts(text_content.read(), writer_name='html')[
                    'html_body'] + "\n"
        elif ".html" in filename:
            text_cont1 = text_content.read()
        elif ".txt" in filename:
            text_cont1 = text_content.read()
        else:
            print(filename + " is not a valid file type!")
            text_cont1 = "NULL"

    return text_cont1 + "\n\n"
Ejemplo n.º 13
0
  def post(self, request, format=None):
    """Store an uploaded document, extract its text and HTML, and forward both.

    Supported uploads are .doc (converted through ``self.converDocToDocx``
    first), .docx and .txt. The raw text and HTML are posted to the analysis
    service and its JSON answer is returned.

    :param request: request carrying the upload under ``FILES['file']``.
    :param format: not used by this method.
    :return: ``Response`` with the service's decoded JSON and status 201, or
        a 400 response when the file type is not supported.
    """
    # Local import: ``html`` is used as a variable name below, so only the
    # function itself is brought into scope.
    from html import escape

    docs_dir = 'backendFondecyt/Docs/'
    uploaded_file = request.FILES['file']
    file_name = uploaded_file.name
    # Text after the LAST dot, so "my.report.docx" is a docx and a name
    # without any dot no longer raises IndexError.
    file_extension = file_name.rsplit(".", 1)[-1] if "." in file_name else ""

    with open(docs_dir + file_name, 'wb+') as destination:
      for chunk in uploaded_file.chunks():
        destination.write(chunk)

    try:
      if file_extension == "doc":
        file_name = self.converDocToDocx(file_name)
      if file_extension in ("doc", "docx"):
        with open(docs_dir + file_name, "rb") as docx_file:
          rawText = mammoth.extract_raw_text(docx_file).value
          html = mammoth.convert_to_html(docx_file).value
      elif file_extension == "txt":
        with open(docs_dir + file_name, "r", encoding="utf-8") as txt_file:
          rawText = txt_file.read()
        # Build the paragraphs from the text already read: iterating the
        # file again after read() yields nothing, which left html empty.
        html = "".join(
          "<p>" + escape(line.rstrip()) + "</p>"
          for line in rawText.splitlines()
          if line.strip() != "")
      else:
        # Previously this fell through to a NameError on rawText/html.
        return Response({'error': 'unsupported file type'},
                        status.HTTP_400_BAD_REQUEST)

      payload = {'texto': rawText, 'html': html}
      data = requests.post('http://redilegra.com/general', data=payload)
      data = json.loads(data.text.encode('utf8'))
    finally:
      # Remove the stored upload whether or not processing succeeded.
      if os.path.exists(docs_dir + file_name):
        os.remove(docs_dir + file_name)
    return Response(data, status.HTTP_201_CREATED)
Ejemplo n.º 14
0
def images_stored_outside_of_document_are_included_in_output():
    # "external-picture.docx" must convert to a single paragraph holding the
    # image inlined as a data URI, with no messages.
    expected_html = """<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>"""
    with open(test_path("external-picture.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx)
    assert_equal(expected_html, result.value)
    assert_equal([], result.messages)
Ejemplo n.º 15
0
def embedded_style_map_is_used_if_present():
    # Embed "p => h1" into a copy of the test document, then convert it
    # without passing any style map: the paragraph must come out as <h1>.
    with _copy_of_test_data("single-paragraph.docx") as docx:
        mammoth.embed_style_map(docx, "p => h1")
        result = mammoth.convert_to_html(fileobj=docx,
                                         ignore_empty_paragraphs=False)
    assert_equal("<h1>Walking on imported air</h1>", result.value)
    assert_equal([], result.messages)
Ejemplo n.º 16
0
def docx2json(fileName):
    """Split a .docx into one HTML file and one JSON file per ``<h1>`` section.

    The document ``<path>\\docFiles\\<fileName>.docx`` is converted to HTML
    and cut at every ``<h1>``. Each section is written to ``htmlFiles`` as
    HTML and to ``jsonFiles`` as ``{"title": ..., "body": ...}``, where the
    body is the section without its heading. Output files are named after
    the heading text with every run of non-alphanumeric characters replaced
    by a space.

    :param fileName: document name without the ".docx" extension.
    """
    with open(path + "\\docFiles\\" + fileName + ".docx", "rb") as docx_file:
        # convert to HTML
        document_html = mammoth.convert_to_html(docx_file).value

    # split the html by heading; the text before the first <h1> is dropped
    sections = ["<h1>" + chunk for chunk in document_html.split("<h1>")]
    for unit in sections[1:]:
        soup = BeautifulSoup(unit, 'html.parser')
        title = soup.h1.string
        data = {}
        # extract h1 text for file name
        cleanh1 = re.sub('[^0-9a-zA-Z]+', ' ', str(soup.h1.string))
        data['title'] = title
        htmlPath = path + "\\htmlFiles\\" + cleanh1 + ".html"
        # Explicit UTF-8 so non-ASCII document text does not depend on the
        # platform default encoding.
        with open(htmlPath, "w", encoding="utf-8") as html_file:
            html_file.write(str(soup))
        print("Converstion of " + fileName + ".docx completed")
        # remove h1 from json
        for tag in soup.find_all('h1'):
            tag.replaceWith('')
        data['body'] = str(soup)
        json_data = json.dumps(data)
        print("Converstion of JSON for " + fileName + ".docx started")
        with open(path + "\\jsonFiles\\" + cleanh1 + ".json", "w",
                  encoding="utf-8") as json_file:
            json_file.write(json_data)
        print("Converstion of JSON for " + fileName + ".docx completed")
Ejemplo n.º 17
0
def sendNotifyUpload(req):
    """Create a ``Notify`` entry from an uploaded .docx file.

    Expects a POST with ``username``, ``token``, ``title`` and a ``file``
    upload. The token must match ``getToken(user, token_exp_time)`` and the
    user must have ``user_type == 2``. The document is converted to HTML and
    stored in the ``link`` field of the new ``Notify`` row.

    :param req: the incoming request.
    :return: ``JsonResponse`` with ``status`` 0 on success, -1 when the
        token does not match and 1 otherwise, plus a ``message`` on failure.
    """
    result = {'status': 1}
    if req.method != 'POST':
        # The view used to return None here, which is not a valid response.
        result['message'] = '请求无效'
        return JsonResponse(result)

    try:
        username = req.POST.get('username')
        token = req.POST.get('token')
        f = req.FILES.get('file')
        title = req.POST.get('title')
        user = models.User.objects.get(username=username)
        if token != getToken(user, token_exp_time):
            result['status'] = -1
            result['message'] = '用户未登录'
        elif user.user_type != 2:
            result['message'] = '无操作权限'
        else:
            updateToken(user)
            # do real work here
            f.seek(0)
            converted = mammoth.convert_to_html(f)
            models.Notify.objects.create(
                title=title, link=converted.value)
            result['status'] = 0
    except Exception as e:
        print(e)
        result['message'] = '请求无效'
    # Returned after the try block rather than from ``finally``: a return
    # inside ``finally`` silently discards every in-flight exception,
    # including KeyboardInterrupt and SystemExit.
    return JsonResponse(result)
Ejemplo n.º 18
0
def docx2html(path: str):
    """Convert the .docx files of a folder to HTML.

    Every entry listed for the folder is handed to ``drop_file``: entries
    that are not .docx straight away, .docx files after the conversion
    attempt. Successful conversions are written next to the source with an
    ".html" extension.

    :param path: folder with the .docx files, relative to ``DEFAULT_FOLDER``.
        Nothing is done when ``check_path(path)`` is truthy or when the
        joined path does not start with ``DEFAULT_FOLDER``.
    """
    import mammoth
    if check_path(path):
        return
    path = os.path.join(DEFAULT_FOLDER, path)
    if not path.startswith(DEFAULT_FOLDER):
        return
    for item in ListDir(path):
        cur_item = os.path.join(path, item)
        if item.endswith('.docx'):
            with open(full_path(cur_item), 'rb') as docx_file:
                try:
                    html = mammoth.convert_to_html(docx_file).value
                except Exception as e:
                    drop_file(cur_item)
                    logger.info('[ERROR]: %s' % e)
                    continue
                dest = os.path.join(path, item.replace('.docx', '.html'))
                with open_file(dest, 'w+') as f:
                    f.write(html)
                drop_file(cur_item)
        else:
            drop_file(cur_item)
Ejemplo n.º 19
0
    def extract(self, target_dir):
        """Convert every .docx in ``self.docx_dir`` to HTML inside ``target_dir``.

        Files are processed in sorted order. For the i-th file the image
        directory ``blog<i>_images`` is created under ``target_dir`` if
        missing and exposed as ``self.image_dir``, images are handled by
        ``self.convert_image``, and the HTML is written to
        ``blog<i>-<name>.html``.

        :param target_dir: existing directory that receives the output; also
            stored on ``self.target_dir``.
        """
        self.target_dir = target_dir

        docx_paths = sorted(
            os.path.join(self.docx_dir, f)
            for f in os.listdir(self.docx_dir)
            if f.endswith(".docx"))

        for i, docx_filename in enumerate(docx_paths):
            print(docx_filename)

            self.image_dir = "blog" + str(i) + "_images"
            image_path = os.path.join(target_dir, self.image_dir)
            if not os.path.exists(image_path):
                os.mkdir(image_path)

            # os.path.basename instead of split("/"): the path was built
            # with os.path.join, whose separator is not "/" everywhere.
            html_name = "blog" + str(i) + "-" + os.path.basename(
                docx_filename).replace(".docx", ".html")

            # Convert before creating the output, so a failed conversion
            # does not leave an empty HTML file behind.
            with open(docx_filename, "rb") as docx_file:
                result = mammoth.convert_to_html(
                    docx_file,
                    convert_image=mammoth.images.img_element(
                        self.convert_image))
            # Any messages, such as warnings during conversion
            if result.messages:
                print("Parsing Message: " + str(result.messages))

            with open(os.path.join(target_dir, html_name), 'w',
                      encoding='utf-8') as html_file:
                html_file.write(result.value)
Ejemplo n.º 20
0
def upload_file(request):
    """Handle the document upload form.

    On a valid POST the document is saved with ``created_by`` set to the
    current user. When the stored file is a .docx, it is converted to HTML,
    stored in ``doc_body`` and the document is saved again.

    :param request: the incoming request.
    :return: the rendered "upload_file.html" template with the form.
    """
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)

        if form.is_valid():
            doc = form.save(commit=False)
            doc.created_by = request.user
            doc.save()
            if doc.document:
                document_name = str(doc.document)
                # endswith instead of split('.')[1]: the old check looked at
                # the text after the FIRST dot, so it misjudged names with
                # more than one dot and raised IndexError on names with none.
                if document_name.endswith('.docx'):

                    print(doc.document)
                    with open("media/" + document_name.replace(" ", "_"),
                              "rb") as docx_file:
                        result = mammoth.convert_to_html(docx_file)
                        html = result.value  # The generated HTML
                    print(html)
                    # NOTE(review): created_by was assigned the user object
                    # above and is replaced by its string form here; confirm
                    # the field accepts both.
                    doc.created_by = str(request.user)
                    doc.doc_body = html
                doc.save()

    else:
        form = DocumentForm()

    return render(request, "upload_file.html", {
        'form': form,
    })
Ejemplo n.º 21
0
    def open_File(self):
        """Ask the user for a file and open it according to its extension.

        * ``.pdf`` is handed to the system web browser.
        * ``.doc`` is run through mammoth; the result is not displayed.
        * any other chosen file is read as text into the editor and becomes
          the current save path.

        The window title is updated in every case, even when the dialog
        returned no file.
        """
        if self.textEdit.toPlainText() == self.saved_data:
            self.textEdit.setText("")

        # Change textEdit to QWebview
        # Read contents of the pdf file
        # SetContent() of the pdf to the QWebView

        name = QtGui.QFileDialog.getOpenFileName(
            self, 'Open File', os.getenv('HOME'),
            "All files(*.*);;(*.pdf);;txt(*.txt);;doc(*.doc)")
        extension = os.path.splitext(name)[1]

        if extension == ".pdf":
            print(name)
            webbrowser.open_new(name)
        elif extension == ".doc":
            with open(name, "rb") as docx_file:
                # Check if image is present in the document or not.
                # The conversion result is currently discarded.
                mammoth.convert_to_html(docx_file)
        elif name:
            with open(name, 'r') as stream:
                self.opendFileText = stream.read()
                self.saved_data = self.opendFileText
                self.textEdit.setText(self.opendFileText)
            self.current_save_file_path = name

        self.setWindowTitle(name + "Qt Mini text Editor")
Ejemplo n.º 22
0
def table_content_list(output_file):
    """Return the non-empty table rows of a .docx document as lists of text.

    The document is converted to HTML with mammoth. In every cell bold runs
    end up as the escaped text ``&lt;b&gt;`` / ``&lt;/b&gt;``, all remaining
    markup is dropped, other angle brackets are escaped and newlines are
    removed.

    :param output_file: passed unchanged to ``mammoth.convert_to_html``.
    :return: one list of cell strings per table row, across all tables;
        empty cells and rows without any non-empty cell are left out.
    """
    soup = BeautifulSoup(mammoth.convert_to_html(output_file).value,
                         "html.parser")

    all_rows = []
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = []
            for column in row.find_all('td'):
                # Protect bold markers and paragraph ends with placeholders
                # before the tags are stripped.
                markup = str(column)
                markup = markup.replace('<strong>', 'start_bold')
                markup = markup.replace('</strong>', 'end_bold')
                markup = markup.replace('</p>', '\n').strip()

                text = BeautifulSoup(markup, "lxml").text
                text = text.replace('start_bold', '<b>')
                text = text.replace('end_bold', '</b>')
                # Escaping runs after the <b> tags are restored, so they are
                # escaped as well.
                text = text.replace('<', '&lt;').replace('>', '&gt;')
                text = text.replace('\n', '').strip()
                if text:
                    cells.append(text)
            if cells:
                all_rows.append(cells)

    return all_rows
Ejemplo n.º 23
0
def test_getHTML(filename):
    """Convert the .docx at ``filename`` and print the HTML, then the messages.

    :param filename: path of the .docx file to read.
    """
    with open(filename, "rb") as docx_file:
        conversion = mammoth.convert_to_html(docx_file)
    print(conversion.value)  # The generated HTML
    print(conversion.messages)  # Any messages, such as warnings
Ejemplo n.º 24
0
def inline_images_referenced_by_path_relative_to_base_are_included_in_output():
    # "tiny-picture-target-base-relative.docx" must convert to a single
    # paragraph holding the image inlined as a data URI, with no messages.
    expected_html = """<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>"""
    docx_path = test_path("tiny-picture-target-base-relative.docx")
    with open(docx_path, "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx)
    assert_equal(expected_html, result.value)
    assert_equal([], result.messages)
Ejemplo n.º 25
0
def docx_to_html():  #docx_to_html
    """Convert "Main/test.docx" to UTF-8 encoded HTML in "Main/test1.html"."""
    # Convert first so a failed conversion does not leave a truncated or
    # empty output file behind; ``with`` closes both handles on any exit.
    with open("Main/test.docx", 'rb') as f:
        document = mammoth.convert_to_html(f)
    with open('Main/test1.html', 'wb') as b:
        b.write(document.value.encode('utf8'))
    print('Done!')
def display_document(doc_id):
    """Render a stored .docx as HTML with the configured words in bold.

    :param doc_id: identifier used as the file name (without extension)
        under the storage folder.
    :return: the rendered 'resalt.html' template, given the HTML as ``text``.
    """
    # Raw string: the same path as before, without relying on ``\E``,
    # ``\s`` and ``\{`` being unrecognised escape sequences (newer Python
    # versions warn about those).
    docx_path = r"E:\Emanuel\storedFiles\{}.docx".format(doc_id)
    with open(docx_path, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    str_res = result.value
    # NOTE(review): plain substring replacement also hits matches inside
    # tags or attribute values of the generated HTML.
    for word in words_to_bold:
        str_res = str_res.replace(word, '<b>{}</b>'.format(word))
    return render_template('resalt.html', text=str_res)
Ejemplo n.º 27
0
 def word_to_html(self, owner, new_file_name):
     """Convert an uploaded .docx to HTML, store it, and delete the upload.

     The HTML is stored through ``self.insert_file`` under the original
     name with ".docx" replaced by ".txt".

     :param owner: forwarded to ``self.insert_file``.
     :param new_file_name: name of the .docx file below ``FILES_PATH``.
     """
     with open(FILES_PATH + new_file_name, "rb") as docx_file:
         result = mammoth.convert_to_html(docx_file)
     html = result.value
     self.insert_file(owner, new_file_name.replace(".docx", ".txt"), html)
     # NOTE(review): the file is read from FILES_PATH but removed from
     # "files/"; confirm both point at the same directory.
     os.remove("files/" + new_file_name)
     # Function-call form: valid on Python 3 and, for a single argument,
     # prints the same on Python 2.
     print(html)
Ejemplo n.º 28
0
def edit_generated(generated_document_name):
    """Return the HTML rendering of a generated document.

    :param generated_document_name: file name inside the folder configured
        as ``AUTOTEMPLATE_UPLOAD_FOLDER``.
    :return: the HTML string produced by mammoth.
    """
    upload_folder = app.config['AUTOTEMPLATE_UPLOAD_FOLDER']
    docx_path = os.path.join(upload_folder, generated_document_name)
    with open(docx_path, "rb") as docx_file:
        return mammoth.convert_to_html(docx_file).value
Ejemplo n.º 29
0
 def pdf2docx_pdf_html(input_pdf, input_docx_location):
     """Turn a PDF into parsed HTML by way of an intermediate .docx file.

     ``parse`` writes the .docx (presumably pdf2docx's ``parse``; confirm
     against the file's imports), mammoth converts it with the style map
     "b => b", and the HTML is parsed with BeautifulSoup.

     :param input_pdf: forwarded to ``parse`` as the source.
     :param input_docx_location: where the intermediate .docx is written and
         then read from.
     :return: a ``BeautifulSoup`` object for the generated HTML.
     """
     parse(input_pdf, input_docx_location)
     conversion = mammoth.convert_to_html(input_docx_location,
                                          style_map="b => b")
     return BeautifulSoup(conversion.value, 'html.parser')
Ejemplo n.º 30
0
def docx_to_html(docx_dir):
    """Convert the .docx at ``docx_dir`` to HTML.

    Prints "No Errors Encountered" when mammoth reports no messages.

    :param docx_dir: path of the .docx file (despite the name, a file path).
    :return: the generated HTML string.
    """
    with open(docx_dir, "rb") as docx_file:
        conversion = mammoth.convert_to_html(docx_file)
    if not conversion.messages:
        print("No Errors Encountered")
    return conversion.value
Ejemplo n.º 31
0
def warn_if_images_stored_outside_of_document_are_specified_when_passing_fileobj_without_name():
    # An in-memory copy of the document has no ``name``, so the external
    # image cannot be located: expect empty output and a single warning.
    expected_warning = results.warning(
        "could not find external image 'tiny-picture.png', fileobj has no name")

    in_memory_docx = io.BytesIO()
    with open(test_path("external-picture.docx"), "rb") as source_fileobj:
        shutil.copyfileobj(source_fileobj, in_memory_docx)

    result = mammoth.convert_to_html(fileobj=in_memory_docx)
    assert_equal("", result.value)
    assert_equal([expected_warning], result.messages)
Ejemplo n.º 32
0
def warning_if_style_mapping_is_not_understood():
    # The style map has one unparseable line ("!!!!") followed by a valid
    # mapping: the valid one must still apply and the bad one must be
    # reported as a warning.
    style_map = "\n!!!!\np => h1"
    expected_warning = "Did not understand this style mapping, so ignored it: !!!!"
    with open(test_path("single-paragraph.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx, style_map=style_map)
    assert_equal("<h1>Walking on imported air</h1>", result.value)
    assert_equal([results.warning(expected_warning)], result.messages)
Ejemplo n.º 33
0
def relationships_are_handled_properly_in_footnotes():
    # A hyperlink inside a footnote must be rendered with its target URL.
    expected_html = (
        '<p><sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup></p>'
        '<ol><li id="doc-42-footnote-1"><p> <a href="http://www.example.com">Example</a> <a href="#doc-42-footnote-ref-1">↑</a></p></li></ol>')

    with open(test_path("footnote-hyperlink.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx, id_prefix="doc-42")
    assert_equal([], result.messages)
    assert_equal(expected_html, result.value)
Ejemplo n.º 34
0
def transform_document_is_applied_to_document_before_conversion():
    # The transform restyles the first child as "Heading1"; the output being
    # an <h1> shows that it ran before the conversion.
    def promote_first_child_to_heading(document):
        document.children[0].style_id = "Heading1"
        return document

    with open(test_path("single-paragraph.docx"), "rb") as docx:
        result = mammoth.convert_to_html(
            fileobj=docx,
            transform_document=promote_first_child_to_heading)
    assert_equal("<h1>Walking on imported air</h1>", result.value)
    assert_equal([], result.messages)
Ejemplo n.º 35
0
def endnotes_are_appended_to_text():
    # Endnote references become superscript links in the text and the notes
    # themselves follow as an ordered list with back-links.
    expected_html = "".join([
        '<p>Ouch',
        '<sup><a href="#doc-42-endnote-2" id="doc-42-endnote-ref-2">[1]</a></sup>.',
        '<sup><a href="#doc-42-endnote-3" id="doc-42-endnote-ref-3">[2]</a></sup></p>',
        '<ol><li id="doc-42-endnote-2"><p> A tachyon walks into a bar. <a href="#doc-42-endnote-ref-2">↑</a></p></li>',
        '<li id="doc-42-endnote-3"><p> Fin. <a href="#doc-42-endnote-ref-3">↑</a></p></li></ol>',
    ])

    with open(test_path("endnotes.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx, id_prefix="doc-42")
    assert_equal([], result.messages)
    assert_equal(expected_html, result.value)
Ejemplo n.º 36
0
 def convert_file(docx_path, styles_path):
     """Convert a .docx file and publish the result on the view model.

     ``self`` is taken from the enclosing scope, not from the parameters.

     :param docx_path: path of the .docx file to convert.
     :param styles_path: path of a text file holding the style mappings, or
         ``None`` to convert without any.
     """
     styles = None
     if styles_path is not None:
         with open(styles_path) as styles_file:
             styles = styles_file.read()

     with open(docx_path, "rb") as docx_file:
         # NOTE(review): the keyword is ``styles`` here, while the other
         # calls in this file use ``style_map``; confirm against the
         # mammoth version in use.
         result = mammoth.convert_to_html(docx_file, styles=styles)
     self._view_model.html = result.value
     self._view_model.messages = result.messages
Ejemplo n.º 37
0
def clean_html(f,m):
    """Convert a .docx to pretty-printed, ASCII-only HTML.

    The first ``<div>`` of the generated HTML gets the class
    'wordsection1'. Non-ASCII characters are written as numeric character
    references.

    :param f: document passed to ``mammoth.convert_to_html``.
    :param m: style map passed to mammoth.
    :return: the mammoth result object with ``value`` replaced by the
        cleaned HTML string.
    :raises IndexError: when the generated HTML contains no ``<div>``.
    """
    converted = mammoth.convert_to_html(f, style_map=m)
    tree = html.fromstring(converted.value)

    # add 'word' class at top
    first_div = tree.xpath('//div')[0]
    first_div.attrib['class'] = 'wordsection1'

    markup = etree.tostring(tree, encoding='unicode', pretty_print=True)
    ascii_bytes = markup.encode('ascii', 'xmlcharrefreplace')
    converted.value = ascii_bytes.decode('utf-8')

    return converted
Ejemplo n.º 38
0
def footnotes_are_appended_to_text():
    # TODO: don't duplicate footnotes with multiple references
    # Footnote references become superscript links in the text and the notes
    # themselves follow as an ordered list with back-links.
    expected_html = "".join([
        '<p>Ouch',
        '<sup><a href="#doc-42-footnote-1" id="doc-42-footnote-ref-1">[1]</a></sup>.',
        '<sup><a href="#doc-42-footnote-2" id="doc-42-footnote-ref-2">[2]</a></sup></p>',
        '<ol><li id="doc-42-footnote-1"><p> A tachyon walks into a bar. <a href="#doc-42-footnote-ref-1">↑</a></p></li>',
        '<li id="doc-42-footnote-2"><p> Fin. <a href="#doc-42-footnote-ref-2">↑</a></p></li></ol>',
    ])

    with open(test_path("footnotes.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx, id_prefix="doc-42-")
    assert_equal([], result.messages)
    assert_equal(expected_html, result.value)
Ejemplo n.º 39
0
def when_style_mapping_is_defined_for_comment_references_then_comments_are_included():
    # With "comment-reference => sup" in the style map, comment references
    # become superscript links and the comments follow as a definition list.
    expected_html = "".join([
        '<p>Ouch',
        '<sup><a href="#doc-42-comment-0" id="doc-42-comment-ref-0">[MW1]</a></sup>.',
        '<sup><a href="#doc-42-comment-2" id="doc-42-comment-ref-2">[MW2]</a></sup></p>',
        '<dl><dt id="doc-42-comment-0">Comment [MW1]</dt><dd><p>A tachyon walks into a bar. <a href="#doc-42-comment-ref-0">↑</a></p></dd>',
        '<dt id="doc-42-comment-2">Comment [MW2]</dt><dd><p>Fin. <a href="#doc-42-comment-ref-2">↑</a></p></dd></dl>',
    ])

    with open(test_path("comments.docx"), "rb") as docx:
        result = mammoth.convert_to_html(
            fileobj=docx,
            id_prefix="doc-42-",
            style_map="comment-reference => sup")
    assert_equal([], result.messages)
    assert_equal(expected_html, result.value)
Ejemplo n.º 40
0
def footnotes_are_appended_to_text():
    # TODO: don't duplicate footnotes with multiple references
    # The uniquifier (fixed at 42 here) is woven into every footnote id.
    expected_html = "".join([
        '<p>Ouch',
        '<sup><a href="#footnote-42-1" id="footnote-ref-42-1">[1]</a></sup>.',
        '<sup><a href="#footnote-42-2" id="footnote-ref-42-2">[2]</a></sup></p>',
        '<ol><li id="footnote-42-1"><p> A tachyon walks into a bar. <a href="#footnote-ref-42-1">↑</a></p></li>',
        '<li id="footnote-42-2"><p> Fin. <a href="#footnote-ref-42-2">↑</a></p></li></ol>',
    ])

    with open(test_path("footnotes.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx,
                                         generate_uniquifier=lambda: 42)
    # TODO: get rid of warnings
    #~ assert_equal([], result.messages)
    assert_equal(expected_html, result.value)
Ejemplo n.º 41
0
def word_tables_are_converted_to_html_tables():
    # A 2x2 Word table between two paragraphs must become a plain HTML
    # table with one <p> per cell.
    expected_html = "".join([
        "<p>Above</p>",
        "<table>",
        "<tr><td><p>Top left</p></td><td><p>Top right</p></td></tr>",
        "<tr><td><p>Bottom left</p></td><td><p>Bottom right</p></td></tr>",
        "</table>",
        "<p>Below</p>",
    ])

    with open(test_path("tables.docx"), "rb") as docx:
        result = mammoth.convert_to_html(fileobj=docx)
    assert_equal([], result.messages)
    assert_equal(expected_html, result.value)
Ejemplo n.º 42
0
def warn_if_images_stored_outside_of_document_are_not_found():
    # The document is copied into an empty temp dir, so the external image
    # it refers to is missing: expect empty output and exactly one warning.
    expected_warning = "could not open external image: 'tiny-picture.png'"

    with tempman.create_temp_dir() as temp_dir:
        document_path = os.path.join(temp_dir.path, "document.docx")
        with open(document_path, "wb") as target_fileobj, \
                open(test_path("external-picture.docx"), "rb") as source_fileobj:
            shutil.copyfileobj(source_fileobj, target_fileobj)

        with open(document_path, "rb") as docx:
            result = mammoth.convert_to_html(fileobj=docx)

        messages = result.messages
        assert_equal("", result.value)
        assert_equal("warning", messages[0].type)
        assert messages[0].message.startswith(expected_warning), "message was: " + messages[0].message
        assert_equal(1, len(messages))
Ejemplo n.º 43
0
def convert_docx_to_html(file_obj):
    """
    Turn a docx file-like object into html.

    The html produced by mammoth is passed through sanitize_html and then
    mark_safe before being returned. Any exception raised by the conversion
    itself is logged at info level and re-raised as a ConversionError
    wrapping the original exception.
    """
    try:
        converted = mammoth.convert_to_html(file_obj)
    except Exception as exc:
        logger.info('Conversion error ' + str(exc))
        raise ConversionError(exc)
    else:
        # Sanitizing happens outside the try body, so errors raised here
        # are not turned into ConversionError.
        return mark_safe(sanitize_html(converted.value))
Ejemplo n.º 44
0
def load_document(document_url):
    """Convert a .docx file into a simplified HTML fragment.

    Every paragraph that contains bold text becomes an ``<h3>`` heading
    (with colons replaced by spaces); every other paragraph becomes a plain
    ``<p>``. Only the paragraph text is kept, inline markup is dropped.

    Args:
        document_url: Path of the .docx file; '/' is treated as the
            path separator when deriving the title.

    Returns:
        A ``(title, html)`` tuple where ``title`` is the last path component
        with '.docx' removed and ``html`` is the concatenated fragment.
    """
    from html import escape

    import mammoth

    # The file only needs to stay open for the conversion itself.
    with open(document_url, 'rb') as docx_file:
        result = mammoth.convert_to_html(docx_file)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(result.value, 'html.parser')
    title = document_url.split('/')[-1].replace('.docx', '')

    data = []
    for idx, p in enumerate(soup.find_all('p')):
        if p.find_all('strong'):
            # NOTE(review): title is always a str at this point, so this
            # guard never skips the first heading -- confirm the intent.
            if not (title is None and idx == 0):
                heading = p.text.replace(':', ' ')
                data.append("<h3>%s</h3>" % escape(heading, quote=False))
        else:
            # p.text is decoded plain text; escape it again before putting
            # it back into markup so '<' and '&' cannot break the output.
            data.append("<p>%s</p>" % escape(p.text, quote=False))

    return title, ''.join(data)
Ejemplo n.º 45
0
def upload():
    """Serve the upload form on GET and convert an uploaded document on POST.

    On POST the uploaded 'document' file is saved into UPLOAD_FOLDER and
    converted with mammoth when its filename passes allowed_file. The reply
    is a JSON body with status 200: ``wrongFileType`` is False and
    ``message`` holds the generated HTML on success, otherwise
    ``wrongFileType`` is True. Any other request method returns None.
    """
    if request.method == 'GET':
        return render_template('upload.html', message='')
    if request.method != 'POST':
        return None

    docfile = request.files['document']
    if docfile and allowed_file(docfile.filename):
        filename = secure_filename(docfile.filename)
        docfile.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        doc_html = mammoth.convert_to_html(docfile).value
        payload = {'wrongFileType': False, 'message': doc_html}
    else:
        payload = {'wrongFileType': True}

    # Both outcomes are reported the same way: JSON body, HTTP 200.
    response = make_response(json.dumps(payload), 200)
    response.headers['Content-Type'] = 'application/json'
    return response
Ejemplo n.º 46
0
    def processFile(filename, reviews):
        """Parse the reviews in a .docx file and append them to ``reviews``.

        The document is converted to HTML (underline mapped to ``<em>``) and
        split on ``<p>``. Short paragraphs (fewer than 10 characters) that
        read 'H', 'M', 'L' or 'R/N' once their last character is dropped set
        the current rotation. Longer paragraphs are consumed in pairs: one
        accepted by ``ReviewParser.isNameString`` starts a review and the
        next paragraph is parsed as its review text.

        Args:
            filename: Path of the .docx file to read.
            reviews: List that receives ``parsedReview.review`` for every
                completed review; it is mutated in place.
        """
        style_map = "u => em"

        # Only the conversion needs the file handle.
        with open(filename, "rb") as docx_file:
            html = mammoth.convert_to_html(docx_file, style_map=style_map).value

        currentRotation = None
        parsedReview = None
        lastParsedReview = None

        for p in html.split('<p>'):
            # Drop the last four characters (presumably the closing '</p>'),
            # then normalise compatibility characters and en dashes.
            s = p[:-4].strip()
            s = unicodedata.normalize('NFKC', s).replace(u'\u2013', '-')

            # Skip empty chunks and chunks that start with markup.
            if not s or s[0] == '<':
                continue

            if len(s) < 10:
                # Short paragraph: drop its final character and check
                # whether what is left is a rotation marker.
                s = s[:-1].rstrip()
                if s in ('H', 'M', 'L', 'R/N'):
                    currentRotation = s
                continue

            if parsedReview is not None:
                # A name line was seen just before: this is its review text.
                parsedReview.parseReviewString(s)
                reviews.append(parsedReview.review)
                lastParsedReview = parsedReview
                parsedReview = None
            elif ReviewParser.isNameString(s):
                parsedReview = ReviewParser(filename, currentRotation)
                parsedReview.parseNameString(s)
                lastParsedReview = None
            elif lastParsedReview is not None:
                # did somebody put a newline in the middle of a review?
                # Try to add it to the last parsed review.
                lastParsedReview.parseReviewString(s)
Ejemplo n.º 47
0
def empty_paragraphs_are_preserved_if_ignore_empty_paragraphs_is_false():
    # With ignore_empty_paragraphs=False, empty.docx must convert to a
    # single empty <p> and produce no messages.
    docx_path = test_path("empty.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx, ignore_empty_paragraphs=False)

    assert_equal("<p></p>", conversion.value)
    assert_equal([], conversion.messages)
Ejemplo n.º 48
0
def docx_containing_one_paragraph_is_converted_to_single_p_element():
    # single-paragraph.docx must convert to exactly one <p> element and
    # produce no messages.
    docx_path = test_path("single-paragraph.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal("<p>Walking on imported air</p>", conversion.value)
    assert_equal([], conversion.messages)
Ejemplo n.º 49
0
def text_boxes_are_read():
    # The text in text-box.docx must show up in the output as a paragraph.
    docx_path = test_path("text-box.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal('<p>Datum plane</p>', conversion.value)
Ejemplo n.º 50
0
def createHtml(folderName, part, html):
    """Write ``html`` to ``<folderName>/content/<part>.html``.

    Args:
        folderName: Base folder; its ``content`` sub-folder must already
            exist.
        part: File name without the ``.html`` extension.
        html: Markup to write.
    """
    filename = folderName + '/content/' + part + '.html'
    # Write UTF-8 explicitly: with the platform default encoding, non-ASCII
    # markup can fail to encode or end up in a locale-specific encoding.
    with open(filename, 'w', encoding='utf-8') as html_file:
        html_file.write(html)
        
def convert_image(image):
    """Build an ``src`` attribute that embeds ``image`` as a data URI.

    ``image.open()`` is used as a context manager yielding a readable
    binary stream; its bytes are base64-encoded and combined with
    ``image.content_type``.

    Returns:
        A dict with the single key ``"src"``.
    """
    with image.open() as image_bytes:
        raw = image_bytes.read()

    encoded_src = base64.b64encode(raw).decode("ascii")
    src = "data:{0};base64,{1}".format(image.content_type, encoded_src)
    return {"src": src}

# Convert ll.docx and read the book metadata out of the first table of the
# generated HTML: the book number comes from the first row, the name and
# description from the fourth row.
with open("ll.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file)
    html = result.value
    counter = 100001
    soup = BeautifulSoup(html)
    table = soup.find('table')
    trs = table.findAll('tr', recursive=False)

    # Second cell of the first row holds the book number.
    header_cells = trs[0].findAll('td')
    book_number = header_cells[1].text
    book_code = 'protec_book_' + book_number
    code = str(book_number) + '_' + str(counter)

    # Fourth row: second cell is the name, third cell the description.
    detail_cells = trs[3].findAll('td')
    book_name = detail_cells[1].text
    book_desc = "".join(str(node) for node in detail_cells[2].contents)

    name = book_name
    # A row without description text is treated as a folder entry.
    if detail_cells[2].text == '':
        _type = 'folder'
        url = ''
Ejemplo n.º 51
0
def underline_is_ignored_by_default():
    # Without extra options the expected output of underline.docx contains
    # no element for the underline.
    docx_path = test_path("underline.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal('<p><strong>The Sunset Tree</strong></p>', conversion.value)
Ejemplo n.º 52
0
def underline_can_be_configured_with_convert_underline_option():
    # Passing convert_underline=mammoth.underline.element("em") must turn
    # the underlined word into an <em> element.
    underline_as_em = mammoth.underline.element("em")
    with open(test_path("underline.docx"), "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx, convert_underline=underline_as_em)

    assert_equal('<p><strong>The <em>Sunset</em> Tree</strong></p>', conversion.value)
Ejemplo n.º 53
0
def underline_can_be_configured_with_style_mapping():
    # The style mapping "u => em" must turn the underlined word into an
    # <em> element.
    docx_path = test_path("underline.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx, style_map="u => em")

    assert_equal('<p><strong>The <em>Sunset</em> Tree</strong></p>', conversion.value)
Ejemplo n.º 54
0
def strikethrough_is_converted_to_s_element_by_default():
    # Without extra options, struck-through text must be wrapped in <s>.
    docx_path = test_path("strikethrough.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal("<p><s>Today's Special: Salmon</s> Sold out</p>", conversion.value)
Ejemplo n.º 55
0
def strikethrough_conversion_can_be_configured_with_style_mapping():
    # The style mapping "strike => del" must wrap struck-through text in
    # <del> instead of the default element.
    docx_path = test_path("strikethrough.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx, style_map="strike => del")

    assert_equal("<p><del>Today's Special: Salmon</del> Sold out</p>", conversion.value)
Ejemplo n.º 56
0
def can_read_xml_files_with_utf8_bom():
    # utf8-bom.docx must convert cleanly: expected paragraph, no messages.
    docx_path = test_path("utf8-bom.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal("<p>This XML has a byte order mark.</p>", conversion.value)
    assert_equal([], conversion.messages)
Ejemplo n.º 57
0
def empty_paragraphs_are_ignored_by_default():
    # Without extra options, empty.docx must convert to an empty string
    # and produce no messages.
    docx_path = test_path("empty.docx")
    with open(docx_path, "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal("", conversion.value)
    assert_equal([], conversion.messages)
Ejemplo n.º 58
0
def embedded_style_map_is_used_if_present():
    # Embed the mapping "p => h1" into a copy of single-paragraph.docx and
    # check that the conversion applies it without being given a style_map.
    with _copy_of_test_data("single-paragraph.docx") as docx_copy:
        mammoth.embed_style_map(docx_copy, "p => h1")
        conversion = mammoth.convert_to_html(fileobj=docx_copy, ignore_empty_paragraphs=False)
        converted_html = conversion.value
        assert_equal("<h1>Walking on imported air</h1>", converted_html)
        assert_equal([], conversion.messages)
Ejemplo n.º 59
0
def images_stored_outside_of_document_are_included_in_output():
    # Converting external-picture.docx from the test data location must
    # embed the picture as a base64 data URI and produce no messages.
    expected_html = """<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>"""

    with open(test_path("external-picture.docx"), "rb") as docx:
        conversion = mammoth.convert_to_html(fileobj=docx)

    assert_equal(expected_html, conversion.value)
    assert_equal([], conversion.messages)
Ejemplo n.º 60
0
import mammoth


# Custom style mappings: paragraphs styled 'Section Title' become <h1> and
# paragraphs styled 'Subsection Title' become <h2>.
style_map = """
p[style-name='Section Title'] => h1:fresh
p[style-name='Subsection Title'] => h2:fresh
"""
# NOTE(review): the error below was recorded by the original author,
# presumably raised by the conversion of the .doc input that follows.
# The message says the input archive has no 'word/styles.xml' entry --
# confirm that the input really is a .docx package.
# KeyError: "There is no item named 'word/styles.xml' in the archive"
with open("../../res/paper.doc", "rb") as doc_file:
    print(doc_file.name)
    # Convert using the custom style mappings defined above.
    result = mammoth.convert_to_html(doc_file, style_map=style_map)
    html = result.value  # The generated HTML
    messages = result.messages  # Any messages, such as warnings during conversion
    print(html)