Ejemplo n.º 1
0
    def save_file(self, filePath):
        """Read a document and store it, split by paragraph and margin number.

        The text returned by ``DocReader`` is decoded with ``CODING`` and
        split as follows:

        * the first line is the file caption, registered with
          ``self.dbi.addFile`` together with the file's base name;
        * the remainder is split into paragraphs on ``NEW_LINE`` followed by
          a form-feed character; the first line of each paragraph is its
          caption, registered with ``self.dbi.addParagraph``;
        * inside a paragraph, a line made only of digits, or a line that
          starts with digits followed by two tabs, opens a new margin-number
          section. Each section is registered with ``self.dbi.addRdNr`` and
          its lines are passed to ``self.save_content``.

        Lines that appear before the first margin number of a paragraph are
        not stored. The section opened by the *last* margin number of a
        paragraph is stored once the paragraph's lines are exhausted.

        ``self.readChars`` is reset to 0 and advanced after every paragraph;
        ``self.fileStatusUpdated`` is emitted with ``(readChars, size)`` after
        each paragraph.

        :param filePath: path of the document handed to ``DocReader``.
        """
        rawCont = DocReader(filePath).read().decode(CODING)

        self.readChars = 0
        size = len(rawCont)
        caption, __, content = rawCont.partition(NEW_LINE)
        paraSplitter = '%s\x0c' % (NEW_LINE)
        paragraphs = content.split(paraSplitter)

        FILE_ID = self.dbi.addFile(path.basename(filePath), caption)
        for para in paragraphs:
            paraCaption, __, paraContent = para.partition(NEW_LINE)
            # NOTE: progress is approximate -- the caption line is not counted
            # and one splitter is counted per paragraph, including the last.
            self.readChars += len(para) + len(paraSplitter)
            PARAGRAPH_ID = self.dbi.addParagraph(paraCaption, FILE_ID)
            curRdNr = None
            rdNrContent = []
            for line in paraContent.split(NEW_LINE):
                # Margin numbers inside text boxes are normally recognised as
                # a line of their own ...
                if line.isdigit():
                    self._save_rdnr(curRdNr, rdNrContent, PARAGRAPH_ID, FILE_ID)
                    rdNrContent = []
                    curRdNr = int(line)
                # ... but for documents imported from odt they are prepended
                # to the text line, followed by two tabs.
                elif re.match(r"\d+\t\t", line[:10]):
                    number, __, rest = line.partition("\t\t")
                    self._save_rdnr(curRdNr, rdNrContent, PARAGRAPH_ID, FILE_ID)
                    rdNrContent = [rest.strip()]
                    curRdNr = int(number)
                else:
                    rdNrContent.append(line)
            # Flush the section that is still open at the end of the
            # paragraph; without this the last margin number is never stored.
            self._save_rdnr(curRdNr, rdNrContent, PARAGRAPH_ID, FILE_ID)
            self.fileStatusUpdated.emit(self.readChars, size)

    def _save_rdnr(self, rdNr, rdNrContent, PARAGRAPH_ID, FILE_ID):
        """Store one margin-number section; do nothing if ``rdNr`` is None."""
        if rdNr is None:
            return
        RDNR_ID = self.dbi.addRdNr(rdNr, PARAGRAPH_ID)
        self.save_content(rdNrContent, RDNR_ID, PARAGRAPH_ID, FILE_ID)
Ejemplo n.º 2
0
def readDocFile(fName):
    """Accumulate, via ``addWord``, every ``capWords`` entry of a document.

    The document is read with ``DocReader``, transcoded from latin-1 to
    UTF-8, and split into a caption (first line) and a body. The body is cut
    into paragraphs on ``NEW_LINE`` followed by a form-feed character, and
    each piece is wrapped in a ``Paragraph`` together with the caption.

    :param fName: path of the document handed to ``DocReader``.
    :return: the value produced by folding ``addWord`` over all ``capWords``
        values, starting from an empty dict.
    """
    reader = DocReader(fName)
    text = reader.read().decode("latin-1").encode("utf-8")
    caption, _sep, body = text.partition(NEW_LINE)

    splitter = u"%s\x0c" % (NEW_LINE)

    collected = {}
    for chunk in body.split(splitter):
        paragraph = Paragraph(chunk, caption)
        # Only capWords are gathered; normalWords is not collected here.
        for key in paragraph.rdWordMap:
            for word in paragraph.rdWordMap[key].capWords.values():
                collected = addWord(collected, word)

    return collected
Ejemplo n.º 3
0
# Miette is "small sweet thing" in french

from cfb.reader import CfbReader
from doc.reader import DocReader
from tools import hex_dump

# Document to dump. The commented-out lines below are alternative sample
# documents; enable exactly one assignment to ``r``.
r = DocReader('../tests/doc/mw_lorem_ipsum.doc')
#r = DocReader('../tests/doc/gd_lorem_ipsum.doc')
#r = DocReader('../tests/doc/oo_lorem_ipsum.doc')
#r = DocReader('../tests/doc/te_lorem_ipsum.doc')

#r = DocReader('../tests/doc/mw_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/gd_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/oo_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/te_vesna_yandex_ru.doc')

# Parenthesised single-argument form: prints the same text under Python 2
# and is also valid syntax under Python 3.
print(r.read())
Ejemplo n.º 4
0
# Miette is "small sweet thing" in french

from cfb.reader import CfbReader
from doc.reader import DocReader
from tools import hex_dump

# Document to dump. The commented-out lines are alternative sample
# documents; enable exactly one assignment to ``r``.
# r = DocReader('../tests/doc/mw_lorem_ipsum.doc')
r = DocReader('../tests/doc/P_089-104.doc')
#r = DocReader('../tests/doc/gd_lorem_ipsum.doc')
#r = DocReader('../tests/doc/oo_lorem_ipsum.doc')
#r = DocReader('../tests/doc/te_lorem_ipsum.doc')

#r = DocReader('../tests/doc/mw_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/gd_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/oo_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/te_vesna_yandex_ru.doc')

# Write the extracted text to the file "out" in the working directory.
# The ``with`` block closes the file even if reading or writing raises.
with open("out", "w") as out:
    out.write(r.read())