def save_file(self, filePath):
    """Import one document into the database, split by paragraph and marginal number.

    The file is read through ``DocReader`` and decoded with ``CODING``.  Its
    first line is stored as the file caption via ``self.dbi.addFile``; the
    remainder is split into paragraphs on ``NEW_LINE`` followed by a form
    feed (``\\x0c``).  The first line of each paragraph is stored as the
    paragraph caption via ``self.dbi.addParagraph``.

    Inside a paragraph, every marginal number ("Randnummer") opens a new
    section.  The lines collected for a section are stored with
    ``self.dbi.addRdNr`` and ``self.save_content`` when the next marginal
    number is found or when the paragraph ends.  Lines that appear before
    the first marginal number of a paragraph are not stored.

    ``self.readChars`` is reset to 0 and advanced by the length of each
    paragraph plus its separator; ``self.fileStatusUpdated`` is emitted with
    ``(self.readChars, size)`` where ``size`` is the decoded text length.

    :param filePath: path of the document to import.
    """
    rawCont = DocReader(filePath).read().decode(CODING)
    self.readChars = 0
    size = len(rawCont)

    caption, __, content = rawCont.partition(NEW_LINE)
    paraSplitter = '%s\x0c' % (NEW_LINE)
    # Marginal number glued to the front of a line, followed by two tabs.
    inlineRdNr = re.compile(r"\d+\t\t")

    fileId = self.dbi.addFile(path.basename(filePath), caption)

    def storeRdNr(rdNr, lines, paragraphId):
        # Persist one marginal number with the lines collected for it;
        # does nothing while no marginal number has been seen yet.
        if rdNr is None:
            return
        rdNrId = self.dbi.addRdNr(rdNr, paragraphId)
        self.save_content(lines, rdNrId, paragraphId, fileId)

    for para in content.split(paraSplitter):
        paraCaption, __, paraContent = para.partition(NEW_LINE)
        self.readChars += len(para) + len(paraSplitter)
        paragraphId = self.dbi.addParagraph(paraCaption, fileId)

        curRdNr = None
        rdNrContent = []
        for line in paraContent.split(NEW_LINE):
            if line.isdigit():
                # Marginal numbers in text boxes are normally recognised
                # as a line of their own ...
                storeRdNr(curRdNr, rdNrContent, paragraphId)
                rdNrContent = []
                curRdNr = int(line)
            elif inlineRdNr.match(line[:10]):
                # ... but when the documents were imported from odt, the
                # number is prepended to the line (together with 2 tabs).
                storeRdNr(curRdNr, rdNrContent, paragraphId)
                number, __, rest = line.partition("\t\t")
                rdNrContent = [rest.strip()]
                curRdNr = int(number)
            else:
                rdNrContent.append(line)

        # Store the section that was still open when the paragraph ended;
        # without this the last marginal number of every paragraph is lost.
        storeRdNr(curRdNr, rdNrContent, paragraphId)

        # NOTE(review): emitted once per paragraph so listeners see progress
        # as readChars grows -- confirm this matches the intended cadence.
        self.fileStatusUpdated.emit(self.readChars, size)
def readDocFile(fName):
    """Gather the ``capWords`` entries of every paragraph in a document.

    The file is read through ``DocReader``, decoded as latin-1 and re-encoded
    as UTF-8.  The first line is used as the caption; the remainder is split
    into paragraphs on ``NEW_LINE`` followed by a form feed (``\\x0c``).  Each
    chunk is wrapped in a ``Paragraph`` and every value found in the
    ``capWords`` of its ``rdWordMap`` entries is passed to ``addWord``.

    :param fName: path of the document to read.
    :return: the accumulator returned by the last ``addWord`` call, or an
        empty dict when no word was added.
    """
    encoded = DocReader(fName).read().decode("latin-1").encode("utf-8")
    caption, _sep, body = encoded.partition(NEW_LINE)

    # NOTE(review): ``body`` is a UTF-8 byte string while the separator below
    # is a unicode literal; under Python 2 this forces an implicit ASCII
    # decode of ``body`` -- verify behaviour with non-ASCII documents.
    chunks = body.split(u"%s\x0c" % (NEW_LINE))

    collected = {}
    for chunk in chunks:
        paragraph = Paragraph(chunk, caption)
        for key in paragraph.rdWordMap:
            for word in paragraph.rdWordMap[key].capWords.values():
                collected = addWord(collected, word)
    return collected
# Miette is "small sweet thing" in French

from cfb.reader import CfbReader
from doc.reader import DocReader
from tools import hex_dump

# Scratch script: read one sample document and print its text.
# Swap the active line for one of the alternatives to try another sample.
r = DocReader('../tests/doc/mw_lorem_ipsum.doc')
#r = DocReader('../tests/doc/gd_lorem_ipsum.doc')
#r = DocReader('../tests/doc/oo_lorem_ipsum.doc')
#r = DocReader('../tests/doc/te_lorem_ipsum.doc')
#r = DocReader('../tests/doc/mw_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/gd_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/oo_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/te_vesna_yandex_ru.doc')

# Parenthesised form: prints the same value under Python 2 and is also
# valid syntax under Python 3, unlike the bare print statement.
print(r.read())
# Miette is "small sweet thing" in French

from cfb.reader import CfbReader
from doc.reader import DocReader
from tools import hex_dump

# Scratch script: read one sample document and dump its text to ./out.
# Swap the active line for one of the alternatives to try another sample.
# r = DocReader('../tests/doc/mw_lorem_ipsum.doc')
r = DocReader('../tests/doc/P_089-104.doc')
#r = DocReader('../tests/doc/gd_lorem_ipsum.doc')
#r = DocReader('../tests/doc/oo_lorem_ipsum.doc')
#r = DocReader('../tests/doc/te_lorem_ipsum.doc')
#r = DocReader('../tests/doc/mw_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/gd_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/oo_vesna_yandex_ru.doc')
#r = DocReader('../tests/doc/te_vesna_yandex_ru.doc')

# The context manager closes the file even if read() or write() raises.
with open("out", "w") as out:
    out.write(r.read())