def getresourcelink():
    """Return a JSON string describing the resources stored for a name.

    Reads the ``message`` query argument, extracts the text of
    ``./app/static/doc/<message>_1.docx`` (falling back to
    ``<message>_1.doc`` when the ``.docx`` cannot be opened) and collects
    the image and video files whose path contains ``message``.

    Returns:
        A JSON document with the keys ``name``, ``piclink``, ``vidlink``
        and ``info`` (the result of ``docx.getdocumenttext``).
    """
    message = request.args.get('message')
    # NOTE(review): ``message`` comes from the query string and is
    # interpolated into file paths below -- confirm it is validated upstream.
    try:
        d = docx.opendocx("./app/static/doc/%s_1.docx" % message)
    except Exception:
        # The original tested a ``flag`` variable that was only bound after a
        # failure, so a *successful* open crashed with NameError.
        d = docx.opendocx("./app/static/doc/%s_1.doc" % message)
    doc = docx.getdocumenttext(d)

    img_dir = "./app/static/img/"
    vid_dir = "./app/static/vid/"
    # [6:] drops the leading "./app/" from every collected path.
    pic_name = [
        path[6:]
        for path in (os.path.join(img_dir, name) for name in os.listdir(img_dir))
        if message in path
    ]
    vid_name = [
        path[6:]
        for path in (os.path.join(vid_dir, name) for name in os.listdir(vid_dir))
        if message in path
    ]
    print(vid_name)
    print(pic_name)

    datai = {"name": message, "piclink": pic_name, "vidlink": vid_name}
    datai['info'] = doc
    return json.dumps(datai, ensure_ascii=False)
def main():
    """Extract the text of the DOCX named by argv[1] into the file argv[2].

    The original opened both command-line paths but then read a hard-coded
    ``a.docx`` and wrote ``output.txt``, leaving both handles open.  The
    paragraphs are written one per line, UTF-8 encoded.
    """
    import io

    infil = opendocx(sys.argv[1])
    paragraphs = getdocumenttext(infil)
    # io.open with an explicit encoding replaces the manual per-paragraph
    # ``encode``; the unicode separator makes the joined text unicode.
    with io.open(sys.argv[2], 'w', encoding='utf-8') as outfil:
        outfil.write(u'\n'.join(paragraphs))
def getTheText(fileNameInput, newfile, fileType):
    """Append the text of one XML part of a document to ``newfile``.

    Args:
        fileNameInput: passed to ``opendocx`` as its first argument.
        newfile: writable file-like object that receives the text.
        fileType: passed to ``opendocx`` as its second argument.

    Returns:
        ``' found'`` when the part was opened and processed, ``' not found'``
        when any step raised.
    """
    status = ' found'
    try:
        # Open the respective XML file and pull its paragraphs.
        document = opendocx(fileNameInput, fileType)
        paratextlist = getdocumenttext(document)
        if len(paratextlist) > 0:
            # NOTE(review): bytes + str concatenation only works on Python 2.
            newparatextlist = [
                paratext.encode("utf-8") + '\n' for paratext in paratextlist
            ]
            newfile.write(''.join(newparatextlist) + '\n\n')
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit.  Still best-effort for ordinary errors.
        status = ' not found'
    return status
def getTheText(fileNameInput, newfile, fileType):
    """Open one XML part of a document and write its text to ``newfile``.

    Args:
        fileNameInput: first argument handed to ``opendocx``.
        newfile: writable file-like object.
        fileType: second argument handed to ``opendocx``.

    Returns:
        ``' found'`` on success, ``' not found'`` if an error occurred while
        opening, reading or writing.
    """
    try:
        document = opendocx(fileNameInput, fileType)
        paratextlist = getdocumenttext(document)
        if len(paratextlist) > 0:
            # One encoded paragraph per line, then a blank separator block.
            # NOTE(review): bytes + str concatenation is Python 2 only.
            lines = [paratext.encode("utf-8") + '\n' for paratext in paratextlist]
            newfile.write(''.join(lines) + '\n\n')
    except Exception:
        # Narrowed from a bare ``except:`` so Ctrl-C and sys.exit() are no
        # longer reported as a missing file.
        return ' not found'
    return ' found'
def document_to_text(self, filename, file_path):
    """Store the plain text of a document in ``self.raw``.

    The converter is chosen from the suffix of *filename*: ``antiword`` for
    ``.doc``, the docx helpers for ``.docx``, ``odt2txt`` for ``.odt``,
    ``self.convert_pdf_to_txt`` for ``.pdf`` and a plain read for ``.txt``.
    For any other suffix ``self.raw`` is left untouched.
    """
    def _tool_output(tool):
        # Run an external converter and keep only the ASCII part of stdout.
        out, _err = Popen([tool, file_path], stdout=PIPE).communicate()
        return out.decode('ascii', 'ignore')

    if filename.endswith(".doc"):
        self.raw = _tool_output('antiword')
    elif filename.endswith(".docx"):
        paragraphs = getdocumenttext(opendocx(file_path))
        self.raw = '\n\n'.join([p.encode("utf-8") for p in paragraphs])
    elif filename.endswith(".odt"):
        self.raw = _tool_output('odt2txt')
    elif filename.endswith(".pdf"):
        self.raw = self.convert_pdf_to_txt(file_path)
    elif filename.endswith(".txt"):
        with open(file_path, 'r') as handle:
            self.raw = handle.read()
def document_to_text(filename, file_path,
                     output_path='/home/yung/download/output_odt.txt'):
    """Convert a ``.doc``, ``.docx`` or ``.odt`` file to text.

    Args:
        filename: name whose suffix selects the converter.
        file_path: path handed to the converter.
        output_path: side file that receives a copy of the ``.docx``/``.odt``
            text.  It is opened (and truncated) for every call; the default
            is the location that used to be hard-coded.

    Returns:
        The extracted text, or ``None`` (after printing a message) when the
        suffix is not supported.
    """
    # ``with`` closes the side file on every path; the original returned
    # before reaching its ``output.close()``.
    with open(output_path, 'w') as output:
        if filename[-4:] == ".doc":
            p = Popen(['antiword', file_path], stdout=PIPE)
            stdout, stderr = p.communicate()
            return stdout.decode('ascii', 'ignore')
        elif filename[-5:] == ".docx":
            document = opendocx(file_path)
            newparatextlist = []
            for paratext in getdocumenttext(document):
                temp_docx = paratext.encode("utf-8")
                newparatextlist.append(temp_docx)
                output.write(temp_docx)
            return '\n\n'.join(newparatextlist)
        elif filename[-4:] == ".odt":
            p = Popen(['odt2txt', file_path], stdout=PIPE)
            stdout, stderr = p.communicate()
            temp_odt = stdout.decode('ascii', 'ignore')
            output.write(temp_odt)
            return temp_odt
        else:
            print("Can't convert")
def document_to_text(path):
    """Return ``(basename, text)`` for the document at *path*.

    ``.doc`` files go through the ``antiword.exe`` bundled in the
    ``antiword`` directory next to this module, ``.docx`` through the docx
    helpers, ``.pdf`` through ``convert_pdf_to_txt``; anything else is read
    as a plain file.  Line breaks are removed from ``.doc`` and ``.pdf``
    output.
    """
    ext = os.path.splitext(path)[1]
    filename = os.path.basename(path)
    if ext == ".doc":
        antiword = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "antiword", "antiword.exe")
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through intact.
        pipe = Popen([antiword, "-m", "CP852", path], stdout=PIPE)
        text = pipe.communicate()[0]
        return filename, unicode(re.sub("\r|\n", "", text).strip())
    elif ext == ".docx":
        document = opendocx(path)
        return filename, "".join(getdocumenttext(document))
    elif ext == ".pdf":
        text = convert_pdf_to_txt(path)
        return filename, unicode(re.sub("\r|\n", "", text).strip())
    else:
        # ``with`` closes the handle the original left open.
        with open(path, "r") as handle:
            text = unicode(handle.read()).strip()
        return filename, text
def readdocx(f, stream_reader):
    """Return the concatenated paragraph text of a docx stream.

    ``stream_to_file`` turns the stream into something ``docx.opendocx``
    accepts; a falsy result yields ``None``.  The text is also printed.
    """
    f = stream_to_file(f, stream_reader)
    if not f:
        return None
    txt = ''.join(docx.getdocumenttext(docx.opendocx(f)))
    print 'reading docx', f, txt
    return txt
def gen():
    """Print the lines (or docx paragraphs) of a user-chosen file in random order.

    Prompts for a file name.  Names ending in ``docx`` are read with the
    ``docx`` module, everything else as a text file.  Entries equal to
    ``"\\n"`` are skipped; each printed entry is followed by a 0.5 s pause.
    """
    try:
        import docx
    except ImportError:
        docx = None
        print("Please type the command 'pip install docx' to continue")

    file_name = raw_input("Enter the file name to randomise words from: ")
    if file_name.endswith("docx"):
        if docx is None:
            # Nothing useful can be done without the module.
            return
        # opendocx wants the archive itself; the original handed it the
        # ``xreadlines()`` iterator of a text-mode handle it never closed.
        data = docx.getdocumenttext(docx.opendocx(file_name))
    else:
        with open(file_name, "r") as acc:
            data = acc.readlines()

    random.shuffle(data)
    for i in data:
        if i != "\n":
            print(i)
            time.sleep(0.5)
def wordToText(path):
    """Write the text of the Word document at *path* to ``<stem>1.txt``.

    The output name is *path* with its extension replaced by ``1.txt``.
    Paragraphs are UTF-8 encoded and separated by a blank line.  If the
    document or the output file cannot be opened, a usage hint is printed
    and the interpreter exits.
    """
    temp = os.path.splitext(path)
    print(path)
    inputFile = path
    outputFile = temp[0] + "1.txt"
    print(outputFile)
    print(inputFile)
    try:
        document = opendocx(inputFile)
        newfile = open(outputFile, 'w')
    except Exception:
        print(
            "Please supply an input and output file. For example:\n"
            "  example-extracttext.py 'My Office 2007 document.docx' 'outp"
            "utfile.txt'"
        )
        exit()

    try:
        # Fetch all the text out of the document and encode each paragraph.
        newparatextlist = [
            paratext.encode("utf-8") for paratext in getdocumenttext(document)
        ]
        # Two newlines under each paragraph.
        newfile.write('\n\n'.join(newparatextlist))
    finally:
        # The original never closed the output handle.
        newfile.close()
def document_to_text(self, filename, file_path):
    """Convert a document to text, saving a ``.txt`` copy next to *filename*.

    ``.doc``/``.odt`` go through ``antiword``/``odt2txt`` (ASCII only),
    ``.docx`` through the docx helpers, ``.pdf`` through
    ``self.convert_pdf_to_txt`` and ``.xlsx`` through
    ``self.csv_from_excel`` followed by ``self.txt_from_csv``.

    Returns:
        The extracted text for ``.doc``, ``.docx`` and ``.odt``; whatever
        ``convert_pdf_to_txt`` returns for ``.pdf``; ``None`` otherwise.
    """
    if filename.endswith(".doc") or filename.endswith(".odt"):
        tool = 'antiword' if filename.endswith(".doc") else 'odt2txt'
        stdout, stderr = Popen([tool, file_path], stdout=PIPE).communicate()
        text = stdout.decode('ascii', 'ignore')
        with open(filename[:-4] + ".txt", "w") as f:
            f.write(text)
        return text
    elif filename.endswith(".docx"):
        document = opendocx(file_path)
        text = '\n\n'.join(
            [paratext.encode("utf-8") for paratext in getdocumenttext(document)])
        with open(filename[:-5] + ".txt", "w") as f:
            f.write(text)
        return text
    elif filename.endswith(".pdf"):
        return self.convert_pdf_to_txt(filename, file_path)
    elif filename.endswith(".xlsx"):
        # The original compared the last FOUR characters with the
        # five-character ".xlsx", so this branch could never run.
        self.csv_from_excel(file_path)
        csv_path = file_path.split(".")[0] + ".txt"
        print(csv_path)
        self.txt_from_csv(csv_path)
def traverse3(self):
    """Copy the text of every ``.docx`` under ``self.path`` onto a member.

    The last two whitespace-separated words of the file's base name are
    used as first and last name to look up ``member`` rows.  When no row
    matches, the names are printed; otherwise the first match gets the
    document text as its ``description`` and is saved.
    """
    for root, dirs, files in os.walk(self.path):
        for entry in files:
            if not entry.endswith(".docx"):
                continue
            full_path = os.path.join(root, entry)
            document = opendocx(full_path)
            words = full_path.split("/")[-1].split(".")[0].split()
            fname = words[-2].strip()
            lname = words[-1].strip()
            query = member.objects.filter(
                name__icontains=fname).filter(name__icontains=lname)
            if query.count() == 0:
                print fname, lname
                continue
            print "CCount 111"
            m = query[0]
            # Every paragraph is prefixed with "\n\r".
            m.description = ''.join(
                '\n\r' + paratext for paratext in getdocumenttext(document))
            m.save()
def docx_to_pdf(infilename, outfilename):
    """Render the text of a DOCX file into a PDF via ``PDFWriter``.

    Each paragraph is UTF-8 encoded, wrapped at 70 columns and followed by
    an empty line.  Exits with status 1 if the input cannot be opened.
    """
    try:
        infil = opendocx(infilename)
    except:
        print("Error opening infilename")
        sys.exit(1)

    paragraphs = getdocumenttext(infil)

    pw = PDFWriter(outfilename)
    pw.setFont("Courier", 12)
    pw.setHeader("DOCXtoPDF - convert text in DOCX file to PDF")
    pw.setFooter("Generated by xtopdf and python-docx")
    wrapper = TextWrapper(width=70, drop_whitespace=False)

    # For Unicode handling: encode everything before writing anything.
    encoded = [paragraph.encode("utf-8") for paragraph in paragraphs]
    for paragraph in encoded:
        for line in wrapper.wrap(paragraph):
            pw.writeLine(line)
        pw.writeLine("")

    pw.savePage()
    pw.close()
def doctoText(filepath):
    """Return the text of the file at *filepath* as a string.

    ``.pdf`` files are delegated to ``convertpdftoText`` and ``.docx`` files
    to the docx helpers.  Anything else is treated as UTF-8 HTML and
    stripped of tags with BeautifulSoup; if that fails, the raw file
    content is returned via ``str()``.

    Partly based on:
    http://davidmburke.com/2014/02/04/python-convert-documents-doc-docx-odt-pdf-to-plain-text-without-libreoffice/
    """
    if filepath[-4:] == ".pdf":
        return convertpdftoText(filepath)
    if filepath[-5:] == ".docx":
        document = opendocx(filepath)
        return "\n\n".join(
            [paratext.encode("utf-8") for paratext in getdocumenttext(document)])

    with open(filepath, "rb") as myfile:
        htmldata = myfile.read()
    try:
        # Clean the HTML and remove the tags.
        edata = htmldata.decode("utf-8", "strict")
        raw = BeautifulSoup(edata).get_text()
        return raw.encode("utf-8", "strict")
    except Exception:
        # The original called read() again here, but the handle was already
        # at EOF, so the fallback always produced empty data.  Reuse the
        # bytes read above instead.
        return str(htmldata)
def recogOrgnz(num, docx_name, seg):
    """Fill ``seg[5]`` from the Word file *docx_name* and return the joined row.

    ``num`` is not used.  ``seg`` is modified in place; the return value is
    its items joined with tabs.
    """
    # Read the Word file that belongs to the paper and recognise the
    # information in it.
    if not os.path.exists(docx_name):
        seg[5] = u'[ERROR_NO_FILE]'
    else:
        doc = docx.opendocx(docx_name)
        doc_text = ''
        # Collect every paragraph up to the first one that starts with
        # 'Abstract'; newlines and tabs become ';' and the two dagger
        # characters are dropped.
        for paragh in docx.getdocumenttext(doc):
            if paragh.find('Abstract') != 0:
                doc_text += paragh.replace('\n', ';').replace(u'\u2021', '').replace(u'\u2020', '').replace('\t', ';')+';'
            else:
                break
        authors = seg[4].split(';')
        # print title
        first_author = authors[0]
        # First token of the first author's name, used as the search key.
        flag = re.split('[ ,.]+', first_author)[0]
        # print flag
        # print doc_text.find(flag)
        if doc_text.find(flag) != -1:
            seg[5] = doc_text[doc_text.find(flag):]
        else:
            seg[5] = doc_text[doc_text.find(';')+1:]
        if len(seg[5]) < 5:
            seg[5] = '[ERROR_FILTER_INFO]'
        # NOTE(review): this assignment overwrites everything computed for
        # seg[5] above -- confirm whether it is a debugging leftover.
        seg[5] = doc_text
    return u'\t'.join(seg)
def doctoText(filepath):
    '''Return the text of the file at *filepath* as a string.

    ``.pdf`` goes to ``convertpdftoText``, ``.docx`` to the docx helpers.
    Every other file is decoded as UTF-8 HTML and stripped of tags with
    BeautifulSoup; when that fails the raw content is returned via ``str()``.

    Partly based on:
    http://davidmburke.com/2014/02/04/python-convert-documents-doc-docx-odt-pdf-to-plain-text-without-libreoffice/
    '''
    if filepath[-4:] == ".pdf":
        return convertpdftoText(filepath)
    elif filepath[-5:] == ".docx":
        document = opendocx(filepath)
        paratextlist = getdocumenttext(document)
        return '\n\n'.join([paratext.encode("utf-8") for paratext in paratextlist])
    else:
        # Read once, up front: the original re-read an exhausted handle in
        # its fallback and therefore always returned empty data there.
        with open(filepath, 'rb') as myfile:
            htmldata = myfile.read()
        try:
            # Clean the HTML and remove the tags.
            edata = htmldata.decode('utf-8', 'strict')
            raw = BeautifulSoup(edata).get_text()
            return raw.encode('utf-8', 'strict')
        except Exception:
            return str(htmldata)
def parse_docx(full_path):
    """Return the UTF-8 encoded paragraphs of a docx file, blank-line separated.

    The unused ``fullpath = "../" + full_path`` local of the original was
    removed; the document was always opened from *full_path* itself.
    """
    document = opendocx(full_path)
    return '\n\n'.join(
        [paratext.encode("utf-8") for paratext in getdocumenttext(document)])
def openSource(self, custom):
    """
    Open a document for processing.

    :param custom: full path to the file
    :return: [document, document_name]
    """
    document = opendocx(custom)
    return [document, custom]
def convert_doc_to_txt(file_path):
    """Return the UTF-8 encoded paragraphs of *file_path*, blank-line separated."""
    from docx import opendocx, getdocumenttext

    paragraphs = getdocumenttext(opendocx(file_path))
    encoded = [text.encode("utf-8") for text in paragraphs]
    return '\n\n'.join(encoded)
def __init__(self, filename, filetype='extension'):
#----------------------------------------------------------------------
    """
    Open a TextReader source.

    :param filename: name of the file to open, or a list-like of lines/rows
    :param filetype: when *filename* is list-like, the file type the list
        should be interpreted as
    """
    # A real file name is a str/unicode; anything else is taken as list-like.
    if type(filename) in [str, unicode]:
        self.ftype = filename.split('.')[-1].lower()    # extension
        self.intype = 'file'
        if self.ftype not in VALIDFTYPES:
            raise parameterError('Invalid filename {}: must have extension in {}'.format(filename, VALIDFTYPES))
    else:
        self.ftype = filetype.lower()
        self.intype = 'list'
        if self.ftype not in VALIDLTYPES:
            raise parameterError('Invalid list: must use filetype in {}'.format(VALIDLTYPES))

    from_file = self.intype == 'file'

    if self.ftype in ('xls', 'xlsx'):
        # Excel: only the first sheet is considered.
        import xlrd
        self.workbook = xlrd.open_workbook(filename)
        self.sheet = self.workbook.sheet_by_index(0)
        self.currrow = 0
        self.nrows = self.sheet.nrows
        self.delimited = True       # rows are already broken into columns
        self.workbook.release_resources()   # sheet is already loaded
    elif self.ftype == 'docx':
        # Word: iterate over the paragraph texts.
        import docx
        doc = docx.opendocx(filename)
        self.lines = iter(docx.getdocumenttext(doc))
        self.delimited = False
    elif self.ftype == 'txt':
        # Plain text.
        self.TXT = open(filename, 'r') if from_file else iter(filename)
        self.delimited = False
    elif self.ftype == 'csv':
        # CSV.
        self._CSV = open(filename, 'rb') if from_file else iter(filename)
        self.CSV = csv.reader(self._CSV)
        self.delimited = True

    self.delimiters = None
    self.opened = True
def parse_word_file(self, filename):
    """Read a Word file and feed its text to the paragraph and word analysers."""
    # Read the file: paragraphs are joined with single spaces.
    paragraphs = docx.getdocumenttext(docx.opendocx(filename))
    text = " ".join(paragraphs)
    self.parse_paragraphs(text)

    # Analyse the words (runs of word characters and apostrophes).
    self.parse_text_statistics(re.findall(r"[\w']+", text))
def parse_file(self, f):
    """Return ``{'text': ...}`` with the UTF-8 encoded paragraphs of *f*, one per line."""
    encoded = [p.encode('utf-8') for p in getdocumenttext(opendocx(f))]
    return {'text': '\n'.join(encoded)}
def main(argv=None):
    """Load the configuration, parse the command line and read the schedule.

    Args:
        argv: argument list for the parser; ``None`` means ``sys.argv[1:]``.

    Side effects:
        Sets the module globals ``USRTZ``, ``CALL_START_TIME``,
        ``CALL_END_TIME`` and ``MARK_BEGIN`` when a configuration file is
        found, otherwise writes a configuration file with default values.
        As far as this block shows, the paragraph list built at the end is
        not returned.
    """
    global USRTZ, CALL_START_TIME, CALL_END_TIME, MARK_BEGIN

    # Load configuration
    f_config = ['.workcal/config', 'workcal.ini']  # TODO: make generic?
    config = configparser.SafeConfigParser()
    found = config.read(f_config)
    if found:
        USRTZ = pytz.timezone(config.get('core', 'timezone'))
        callst = config.get('core', 'call-start').split(':')
        callen = config.get('core', 'call-end').split(':')
        CALL_START_TIME = time(int(callst[0]), int(callst[1]))
        CALL_END_TIME = time(int(callen[0]), int(callen[1]))
        MARK_BEGIN = config.get('core', 'file-start')
    else:
        # No configuration yet: write one with default values.
        config.add_section('core')
        config.set('core', 'timezone', 'UTC')
        config.set('core', 'call-start', '16:30')
        config.set('core', 'call-end', '7:30')
        config.set('core', 'file-start', 'PICU SCHEDULE')
        if platform.system() == 'Windows':
            with open('workcal.ini', 'wb') as configfile:
                config.write(configfile)
        else:
            # The directory may exist without a config file in it.
            if not os.path.isdir('.workcal'):
                os.mkdir('.workcal')
            with open('.workcal/config', 'wb') as configfile:
                config.write(configfile)

    # Command line setup
    parser = argparse.ArgumentParser(
        prog='workcal',
        description='Process emailed Word file containing work schedule.',
        epilog="Ex:\n\tworkcal -f dec.docx -p Tanaka")
    parser.add_argument('-i', '--ics',
                        help='output to iCalendar format [RFC 5545]',
                        action='store_true')
    parser.add_argument('-f', '--file',
                        help='file to get calendar information from',
                        nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    parser.add_argument('-p', '--person',
                        help='schedule for person requested, otherwise all events listed',
                        default='Tanaka')
    # Honour the ``argv`` parameter, which the original accepted but ignored.
    args = parser.parse_args(argv)

    # Open file
    try:
        document = opendocx(args.file)
        paratextlist = getdocumenttext(document)
    except (KeyError, TypeError):
        # The original ``except KeyError, TypeError:`` caught only KeyError
        # and bound it to the name TypeError.
        # File is not docx
        with open(args.file.name, 'rb') as f:
            document = f.read()
        ldoc = re.split('\r|\x07', document)  # FRAGILE
        start = ldoc.index(MARK_BEGIN)  # FRAGILE
        preparatextlist = [x for x in ldoc[start:]
                           if not re.search('[\x00-\x1f|\x7f-\xff]', x)]
        paratextlist = [x for x in preparatextlist if x != '']  # FRAGILE
def ParseDOCX(doc):
    """Concatenate the UTF-8 encoded paragraphs of *doc* and hand them to ``ParseHTML``."""
    encoded = [text.encode("utf-8") for text in getdocumenttext(opendocx(doc))]
    # Paragraphs are joined without any separator.
    html = "".join(encoded)
    return ParseHTML(html, True)
def read_docx(post_path):
    '''
    Print the paragraph list of a docx file.

    :param post_path: path of the docx file
    :return: None (the paragraphs are only printed)
    '''
    print(docx.getdocumenttext(docx.opendocx(post_path)))
def loaddoc(request):
    """Open the uploaded ``file`` of *request* as a docx; print any error."""
    try:
        buf = StringIO()
        buf.write(request.FILES['file'].read())
        document = opendocx(buf)
    except Exception as e:  # fix this later
        print(e)
def main():
    """Extract the text of the DOCX argv[1] into the file argv[2].

    Prints the usage text and exits with status 1 unless exactly two
    arguments were given.
    """
    if len(sys.argv) == 3:
        source, target = sys.argv[1], sys.argv[2]
        infil = opendocx(source)
        outfil = open(target, 'w')
        extract_docx_text(infil, outfil)
        return
    print(usage())
    sys.exit(1)
def document_to_text(filename, file_path):
    """Return the text of ``file_path + filename + ".docx"`` and delete that file.

    Paragraphs are UTF-8 encoded and separated by a blank line.
    """
    print("inside doctotxt")
    docx_path = file_path + filename + ".docx"
    encoded = [text.encode("utf-8") for text in getdocumenttext(opendocx(docx_path))]
    os.remove(docx_path)
    return '\n\n'.join(encoded)
def convert_docx_new(fileori):
    """Return the paragraphs of in-memory docx content, one per line.

    Returns an empty string when ``opendocx`` is ``None`` or when anything
    goes wrong during extraction.
    """
    if opendocx is None:
        return ""
    try:
        buf = StringIO.StringIO(fileori)
        return "\n".join(getdocumenttext(opendocx(buf)))
    except:
        return ""
def convert_docx_to_text(filename=None, blob=None):
    """Return the text of a docx given either a filename or a binary object.

    docx.opendocx(file) uses zipfile.ZipFile, which accepts a filename or a
    file-like object:
    https://github.com/mikemaccana/python-docx/blob/master/docx.py
    https://docs.python.org/2/library/zipfile.html
    """
    with get_filelikeobject(filename, blob) as fp:
        paragraphs = docx.getdocumenttext(docx.opendocx(fp))
    return '\n\n'.join(paragraphs)
def docx_to_txt(p, n):
    """Convert the docx file *p* to a UTF-8 text file numbered *n*.

    The output is written under the module-level ``path1`` directory, one
    paragraph per line.
    """
    document = docx.opendocx(p)
    text = docx.getdocumenttext(document)
    out_path = path1 + '\\' + '湖南-无认罪认罚' + str(n) + '.txt'
    # ``with`` closes the file even if a write fails; the original also
    # reused the builtin-shadowing name ``file`` for both document and output.
    with open(out_path, 'w', encoding="utf-8") as out:
        out.writelines(str(paragraph) + '\n' for paragraph in text)
def get_text_from_docx(self, filename):
    """Return the UTF-8 encoded paragraphs of *filename* joined by spaces."""
    encoded = []
    for paratext in getdocumenttext(opendocx(filename)):
        encoded.append(paratext.encode("utf-8"))
    return ' '.join(encoded)
def docx_to_pdf(infilename, outfilename):
    """Open the DOCX file *infilename*; report the error and exit 1 on failure.

    Only the opening step is present in this block; *outfilename* is not
    used here.
    """
    try:
        infil = opendocx(infilename)
    except Exception as e:
        print("Error opening infilename")
        print("Exception: " + repr(e) + "\n")
        sys.exit(1)
def get_trans_txt(path):
    """Return the UTF-8 encoded paragraphs of *path*, separated by blank lines."""
    paragraphs = getdocumenttext(opendocx(path))
    return '\n\n'.join([text.encode("utf-8") for text in paragraphs])
def convert_docx_to_txt(self, f):
    """Write the text of the docx file *f* to the module-level ``outtxt`` path.

    Paragraphs are UTF-8 encoded and separated by a blank line.
    """
    # ``with`` closes the output even when reading the document fails; the
    # original leaked the handle in that case.
    with open(outtxt, 'w') as output:
        document = docx.opendocx(str(f))
        encoded = [
            paratext.encode('utf-8')
            for paratext in docx.getdocumenttext(document)
        ]
        output.write('\n\n'.join(encoded))
    # Report success only after the text has actually been written.
    print("Convertion docx to txt is Successfully finished")
def handle_docx(input_path, output_path):
    """Extract the text of *input_path*, write it to *output_path* and return it.

    Returns:
        The UTF-8 encoded paragraphs joined by newlines, or the string
        ``"Unable to open the file"`` when either file cannot be opened.
    """
    try:
        document = opendocx(input_path)
        newfile = open(output_path, "w")
    except Exception:
        return "Unable to open the file"

    # The original opened the output file but never wrote to it or closed
    # it, leaving an empty file and a leaked handle.
    with newfile:
        text = "\n".join(
            [paratext.encode("utf-8") for paratext in getdocumenttext(document)])
        newfile.write(text)
    return text
def count_docx(file_name):
    """Return the number of ``\\w+`` words in the docx file *file_name*.

    Prints a message and exits when the file cannot be opened.
    """
    try:
        document = docx.opendocx(file_name)
    except:
        print('Cannot open file to read.')
        exit()
    encoded = [text.encode("utf-8") for text in docx.getdocumenttext(document)]
    words = re.findall(r'\w+', '\n'.join(encoded))
    return len(words)
def docx(self, path):
    """
    Extract the text of a docx file.

    Input: path of the docx file, joined onto DATA_DIR
    Output: the paragraphs joined by newlines, or None when extraction
    fails (the exception is logged)
    """
    try:
        document = opendocx(os.path.join(DATA_DIR, path))
        text = '\n'.join(getdocumenttext(document))
    except Exception as err:
        self.log.exception(err)
        return None
    return text
def load_corpus(directory):
    """Load the ``*txt8`` and ``*docx`` files of *directory*.

    *directory* is concatenated directly with each file name.

    Returns:
        ``(texts, docs)``: ``texts`` maps the file name minus its last
        character to the ASCII-decoded content of each ``txt8`` file;
        ``docs`` maps each docx file name to ``flatten`` applied to the
        paragraphs of the cleaned document.
    """
    texts, docs = {}, {}
    for f in os.listdir(directory):
        full_name = directory + f
        print 'Loading: ', full_name
        if f.endswith("txt8"):
            with codecs.open(full_name, 'r', 'ascii', 'ignore') as text:
                texts[f[:-1]] = text.read()
        elif f.endswith('docx'):
            cleaned = docx.clean(docx.opendocx(full_name))
            docs[f] = flatten(docx.getdocumenttext(cleaned))
    return texts, docs
def _docx_to_txt(file_path, dst_dir):
    """
    Extract the text of the docx file at file_path, pass it through
    unidecode and save it in dst_dir under the same base name with the
    .docx suffix replaced by .txt.  Returns 0.
    """
    base_name = os.path.basename(file_path)
    target = os.path.join(dst_dir, re.sub(r'\.docx$', '.txt', base_name))
    paragraphs = getdocumenttext(opendocx(file_path))
    txt = unidecode('\n'.join(paragraphs))
    with open(target, 'w') as out:
        out.write(txt)
    return 0
def file_list(directory=u'C:/Users/eliav/Downloads/Rosh_on_Talmud-2015-03-31/Rosh on Talmud'):
    """Collect the text of the docx files in *directory*.

    Args:
        directory: folder to scan; defaults to the location that used to be
            hard-coded (twice) in the body.

    Returns:
        ``(names, texts)``: ``texts`` maps the part of each file name before
        the first dot to the concatenated paragraph text, and ``names`` is
        ``texts.keys()``.  Only files whose second dot-separated component
        is ``docx`` are read.
    """
    vr = {}
    for f in os.listdir(directory):
        print(f)
        parts = f.split(".")
        if len(parts) > 1 and parts[1] == "docx":
            document = docx.opendocx(os.path.join(directory, f))
            # join() replaces the quadratic ``b = b + a`` accumulation.
            vr[parts[0]] = "".join(docx.getdocumenttext(document))
    files_list = vr.keys()
    return files_list, vr
def queryPB(self):
    """Search the files in ``<cwd>\\source`` for the text in ``self.lineEdit``.

    For every ``.docx`` or ``.md`` file, each line containing the search
    text (as typed or capitalized) is expanded to the surrounding section
    delimited by lines containing "来源", and that section is appended to
    ``self.plainTextEdit`` together with the file location.
    """
    self.plainTextEdit.clear()
    pwd = os.getcwd()
    address = pwd + "\\source"
    filenames=os.listdir(address)
    for filename in filenames:
        filepath = address+'\\'+filename
        ext = os.path.splitext(filepath)[1]
        # NOTE(review): for any other extension docx_lines keeps the value of
        # the previous file (or is unbound on the first one) -- confirm.
        if ext == ".docx":
            document = docx.opendocx(filepath) # open the docx file
            docx_lines = docx.getdocumenttext(document)
            docx_lines_num = len(docx_lines)
        elif ext == ".md":
            # NOTE(review): this handle is never closed.
            document = open(filepath,'r',encoding = "utf-8") # open the markdown file
            docx_lines = document.readlines()
            docx_lines_num = len(docx_lines)
        print(docx_lines_num)
        # Bounds of the section emitted last, used to skip repeats.
        last_head = 0
        last_tail = 0
        for index,line in enumerate(docx_lines):
            if self.lineEdit.text() in line or self.lineEdit.text().capitalize() in line:
                string = []
                string.append("文件位置:" + filepath + "\n")
                now_index_head = index
                now_index_tail = index
                # Walk backwards to the previous delimiter line.
                # NOTE(review): the index can become negative and wrap around
                # to the end of the list when no delimiter precedes the match.
                while True:
                    now_index_head = now_index_head - 1
                    if "来源" in docx_lines[now_index_head]:
                        break
                # Walk forwards to the next delimiter line or the last line.
                # NOTE(review): a match on the very last line indexes one past
                # the end before the bound check below can run.
                while True:
                    now_index_tail = now_index_tail + 1
                    if "来源" in docx_lines[now_index_tail]:
                        break
                    if now_index_tail == (docx_lines_num -1):
                        # Reached the last line: include it in the slice.
                        now_index_tail = now_index_tail + 1
                        break
                # Emit each section only once, even if it matches repeatedly.
                if (last_head != now_index_head) or (last_tail != now_index_tail):
                    last_head = now_index_head
                    last_tail = now_index_tail
                    for i in docx_lines[now_index_head:now_index_tail]:
                        string.append(i + "\n")
                    string.append("\n")
                    self.plainTextEdit.appendPlainText("".join(string))
def _docx_to_txt(file_path, dst_dir, file_name):
    """
    Extract the text of a docx file with the docx python module, pass it
    through unidecode and save it as .txt in dst_dir.  When file_name is
    None the base name of file_path is used.  Returns 0.
    """
    if file_name is None:
        file_name = os.path.basename(file_path)
    target = os.path.join(dst_dir, re.sub(r'\.docx$', '.txt', file_name))
    paragraphs = getdocumenttext(opendocx(file_path))
    txt = unidecode('\n'.join(paragraphs))
    with open(target, 'w') as out:
        out.write(txt)
    return 0
def main():
    """Extract the text of the DOCX argv[1] into the file argv[2].

    Exits with status 1 when the argument count is wrong (after printing
    the usage text) or when either file cannot be opened.
    """
    if len(sys.argv) != 3:
        print(usage())
        sys.exit(1)
    try:
        infil = opendocx(sys.argv[1])
        outfil = open(sys.argv[2], 'w')
    except Exception as e:
        # The original exited silently; say what went wrong first.
        sys.stderr.write("Exception: " + repr(e) + "\n")
        sys.exit(1)
    extract_docx_text(infil, outfil)
def main():
    """Parse every report under ``./data`` and write ``./results.json``.

    ``./data`` holds one directory per report, each containing docx files.
    In every file the first paragraph names the office, the second the
    topic and the third may list the inspected units.  Paragraphs that
    follow a marker found in ``_POI`` are accumulated into result entries.
    """
    reports = os.listdir("./data")
    offices = []
    topics = []
    units = []
    # The original rebuilt ``results`` for every file and finally called
    # ``json.dumps(results, open(...))``, which writes nothing.  Collect the
    # entries of all files and really dump them.
    all_results = []
    number_re = re.compile(r"(\d+)\.?")

    for report in reports:
        files = os.listdir("./data/%s" % (report,))
        for file_name in files:
            print("./data/%s/%s" % (report, file_name))
            paragraphs = docx.getdocumenttext(
                docx.opendocx("./data/%s/%s" % (report, file_name)))
            results = []
            flag = False
            poi_index = -1  # uninitialized
            office_name = get_ref(paragraphs[0].strip(), offices)  # first paragraph is the office
            topic = get_ref(paragraphs[1].strip(), topics)  # second paragraph is the report topic

            # Third paragraph is the inspected units.
            # TODO: More intelligent unit extraction
            # NOTE(review): new_inspected is built but not used afterwards.
            if u"הגופים המבוקרים:" in paragraphs[2]:
                inspected = paragraphs[2].split(":")[1].strip().split(";")
                new_inspected = []
                for inspectee in inspected:
                    inspectee = inspectee.replace(u"\u2014", "-").split("-")
                    inspectee = [i.strip() for i in inspectee]
                    inspectee_name = inspectee[0] if len(inspectee) == 1 else inspectee[1]
                    unit = [u for u in units if u["name"] == inspectee_name]
                    if len(unit) == 0:
                        office = "" if len(inspectee) == 1 else get_ref(inspectee[0].strip(), offices)
                        units.append({"name": inspectee_name,
                                      "slug": hash(inspectee_name),
                                      "office": office})
                        new_inspected.append(units[-1]["name"])  # TODO: use slug instead of name
                    else:
                        new_inspected.append(unit[0]["name"])  # TODO: use slug instead of name

            for paragraph in paragraphs:
                paragraph = paragraph.strip()
                if paragraph in _POI:
                    # A marker paragraph starts a new result entry.
                    flag = True
                    results.append({"id": 0, "type": 0, "status": 0, "text": "",
                                    "followup": "", "link": "", "report": report,
                                    "unit": "", "topic": topic.encode("utf-8"),
                                    "office": office_name.encode("utf-8")})
                    poi_index = _POI.index(paragraph)
                elif flag:
                    numbered = number_re.match(paragraph)
                    if numbered:
                        results[-1]["id"] = int(numbered.group(1))
                    results[-1][_POI_TYPES[poi_index]] += (
                        number_re.sub("", paragraph).encode("utf-8")
                        if len(paragraph) > 1 else "")
            # TODO: Properly compute status
            all_results.extend(results)

    with open("./results.json", "w") as out:
        json.dump(all_results, out)
def main():
    """Parse every docx file under ``./data`` and insert rows into ``update_table``.

    In each file the first paragraph names the office, the second the topic
    and the third lists the inspected units separated by ';'.  Paragraphs
    that follow a marker found in ``_POI`` are accumulated into result
    entries; one row is inserted per (entry, inspected unit) pair.
    """
    files = os.listdir( "./data" )
    # NOTE(review): this handle is opened (truncating the file) but never
    # written to or closed in this block.
    result = codecs.open("./result.txt", "w+", "utf-8")
    offices = []
    topics = []
    units = []
    for file in files:
        paragraphs = docx.getdocumenttext(docx.opendocx("./data/%s" % (file, )))
        results = []
        flag = False
        type = -1 #type uninitialized
        #TODO: Switch to using plain lists for offices, topics & units
        office_name = get_ref( paragraphs[0].strip(), offices ) #First paragraph is the office
        topic = get_ref( paragraphs[1].strip(), topics ) #Second paragraph is the report topic
        #Third paragraph is the inspected units
        #TODO: Insure the paragraph is a list of inspectees, and clean the header
        #TODO: Smarter filtering for units and offices
        inspected = paragraphs[2].strip().split(";")
        new_inspected = []
        for inspectee in inspected:
            # "office - unit" or just "unit".
            inspectee = inspectee.split("-")
            inspectee = [i.strip() for i in inspectee]
            inspectee_name = inspectee[0] if len(inspectee) == 1 else inspectee[1]
            # NOTE(review): len() on the result requires filter() to return a
            # list, i.e. Python 2 -- confirm.
            unit = filter( lambda x: x["name"] == inspectee_name, units)
            if len( unit ) == 0:
                # Unit not seen before: register it.
                office = "" if len( inspectee ) == 1 else get_ref( inspectee[0].strip(), offices )
                units.append( { "name": inspectee_name, "slug": hash( inspectee_name ), "office": office } )
                new_inspected.append( units[-1]["name"] )
            else:
                new_inspected.append( unit[0]["name"] )
        for paragraph in paragraphs:
            paragraph = paragraph.strip()
            if paragraph in _POI:
                # A marker paragraph starts a new result entry.
                flag = True
                results.append( { "id": 0, "type": 0, "status": 0, "text": "", "followup": "", "link": "", "report": "", "unit": "", "topic": topic.encode("utf-8"), "office": office_name.encode("utf-8") } )
                type = _POI.index(paragraph)
            elif flag:
                # A leading number becomes the entry id; the remaining text is
                # appended to the field selected by the current marker.
                if re.match("(\d+)\.?", paragraph):
                    results[-1]["id"] = int(re.match("(\d+)\.?", paragraph).groups(1)[0])
                results[-1][_POI_TYPES[type]] += re.sub("(\d+)\.?", "", paragraph).encode("utf-8") if len(paragraph) > 1 else ""
        #TODO: Properly compute status
        for entry in results:
            for inspectee in new_inspected:
                update_table.insert_row( (entry["id"], entry["type"], entry["status"], entry["text"], entry["followup"],
                                          entry["link"], entry["report"], inspectee.encode("utf-8"), entry["topic"], entry["office"]) )
def document_to_text(filename, file_path):
    """Return the text of a ``.doc``, ``.docx`` or ``.pdf`` document.

    The suffix of *filename* selects the converter; *file_path* is what the
    converter reads.  Returns ``None`` for any other suffix.
    """
    if filename.endswith(".doc"):
        out, _err = Popen(['antiword', file_path], stdout=PIPE).communicate()
        return out.decode('ascii', 'ignore')
    if filename.endswith(".docx"):
        paragraphs = getdocumenttext(opendocx(file_path))
        return '\n\n'.join([text.encode("utf-8") for text in paragraphs])
    if filename.endswith(".pdf"):
        return convert_pdf_to_txt(file_path)
    return None
def convDocx(portfolios_dir, f):
    """Build a report dictionary from the docx file *f* in *portfolios_dir*.

    Paragraph pairs produced by ``pairwise`` become ``content`` entries when
    the first is not much longer than the second.  If no pair qualifies, a
    single placeholder entry is stored instead.
    """
    docu = opendocx(os.path.join(portfolios_dir, f))
    paratextlist = getdocumenttext(docu)

    doc_dict = {
        'doctype': "report",
        'origin': "shareworks",
        '_id': f,
        'student_email': email_from_fname(f),
    }
    # Assume a header is not much longer than the text that follows it.
    doc_dict['content'] = [
        {'header': pair[0], 'text': "\n\n".join(pair)}
        for pair in pairwise(paratextlist)
        if len(pair[0]) / len(pair[1]) < 1.5
    ]
    if len(doc_dict['content']) == 0:
        doc_dict['content'] = [{'text': 'FrozenCutlery', 'txt_ann': None}]
    return doc_dict
def docxExtract(docxfile):
    """Return the UTF-8 encoded paragraphs of *docxfile*, one per line.

    Prints a message and exits when the file cannot be opened.
    """
    try:
        document = opendocx(docxfile)
    except:
        print("Error opening docx")
        exit()
    # Fetch all the text and make an explicitly encoded version of it.
    encoded = [text.encode("utf-8") for text in getdocumenttext(document)]
    return '\n'.join(encoded)
def readdocx():
    '''Microsoft document reader.

    Looks up the ``vfile`` row whose uuid is ``request.vars.did`` and
    renders the uploaded file as HTML, wrapping occurrences of
    ``request.vars.h`` (default "VixenServer") in a highlighted ``<b>`` tag.
    '''
    did = request.vars.did
    highlight = request.vars.h
    if not highlight:
        highlight = "VixenServer"
    fdb = db(db.vfile.uuid == did).select().last()
    upfolder = '%sstatic/uploads' % request.folder
    # NOTE(review): fdb.name is read here, before the ``if fdb`` checks
    # below -- confirm whether fdb can be None at this point.
    frpath = '%s/%s' % (upfolder, fdb.name)
    fpath = os.path.abspath(frpath)
    if fdb and fdb.ext == 'txt':
        if os.path.isfile(fpath):
            # NOTE(review): the handle is not closed, the content is inserted
            # unescaped, and the closing tag is written as '<p'.
            data = open(fpath, 'r').read()
            data = '<p style="padding:5px;">{}<p'.format(data)
            return data
    if fdb and fdb.ext == 'docx':
        if os.path.isfile(fpath):
            document = docx.opendocx(fpath)
            paratextlist = docx.getdocumenttext(document)
            newparatextlist = []
            for paratext in paratextlist:
                newparatextlist.append(paratext.encode("utf-8"))
            asc = string.ascii_lowercase
            lang = 'en'
            #print newparatextlist[0][0]
            # The language is 'fa' unless the first character of the first
            # paragraph is an ASCII letter.
            if newparatextlist and not newparatextlist[0][0].lower() in asc:
                lang = 'fa'
            data = '<br/>'.join(newparatextlist).replace(
                highlight,
                '<b style="background:yellow;color:darkgreen">{}</b>'.format(
                    highlight))
            data = '<div style="padding:5px;">{}</div>'.format(data)
            response.title = 'Document: %s' % fdb.rawname
            if not request.vars.raw == 'true':
                # Normal request: hand the pieces to the view.
                return dict(name=fdb.rawname, data=XML(data), lang=lang)
            else:
                # Raw request: return a self-contained HTML fragment.
                if lang == 'fa':
                    style = "padding:2px;background:#fff;direction:rtl;font-family:terafik;font-size:10px;"
                else:
                    style = "padding:2px;background:#fff;font-family:terafik;font-size:10px;"
                return '<div style="%s">%s</div>' % (style, XML(data))
        else:
            session.flash = 'Error reading document!'
            redirect(request.env.http_referer)