def load_corpus(directory):
    """Load the text and Word documents found directly inside *directory*.

    Every entry returned by ``os.listdir(directory)`` is announced on
    stdout and then dispatched on its filename suffix:

    * names ending in ``"txt8"`` are read as ASCII text, silently dropping
      any byte that is not valid ASCII (``errors='ignore'``);
    * names ending in ``"docx"`` are opened with the ``docx`` module,
      cleaned, and their document text is passed through ``flatten``;
    * anything else is announced but otherwise skipped.

    Args:
        directory: Path of the directory to scan.  It may be given with or
            without a trailing path separator.

    Returns:
        A ``(texts, docs)`` tuple of dictionaries.  ``texts`` maps the
        filename minus its final character (``"a.txt8"`` -> ``"a.txt"``)
        to the decoded file contents.  ``docs`` maps the unmodified
        filename to the result of ``flatten(docx.getdocumenttext(...))``.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be read.
    """
    texts = {}
    docs = {}
    for name in os.listdir(directory):
        # Join with os.path.join rather than string concatenation so the
        # path is correct whether or not *directory* ends with a separator.
        path = os.path.join(directory, name)
        # Single-argument parenthesised form: prints the same text under
        # both the Python 2 print statement and the Python 3 function.
        print('Loading:  ' + path)
        if name.endswith("txt8"):
            with codecs.open(path, 'r', 'ascii', 'ignore') as text:
                # Key drops the last character of the filename.
                texts[name[:-1]] = text.read()
        elif name.endswith('docx'):
            d = docx.clean(docx.opendocx(path))
            # NOTE(review): the original comment said this "converts to
            # nltk text object"; flatten is defined elsewhere -- confirm
            # what it actually returns.
            docs[name] = flatten(docx.getdocumenttext(d))
    return texts, docs
'color': 'auto', 'space': 0, 'sz': 6, 'val': 'single', }, }, 'celstyle': [ {'align': 'center'}, {'align': 'left'}, {'align': 'right'}, ], 'headstyle': { 'fill':'C6D9F1', 'themeFill':None, 'themeFillTint':None }, }) # Cleaning docbody = docx.clean(docbody) # ------------------------------ # Save output # ------------------------------ # Prepare output file outfile = zipfile.ZipFile('out.docx',mode='w',compression=zipfile.ZIP_DEFLATED) # Copy unmodified sections for f in template.namelist(): if not f in map(lambda i: i[0], actlist): fo = template.open(f,'rU') data = fo.read() outfile.writestr(f,data) fo.close()