Exemple #1
0
def getresourcelink():
    """Return a JSON string describing the resources stored for ``message``.

    Reads the ``message`` query argument, extracts the text of
    ``<message>_1.docx`` (falling back to ``<message>_1.doc`` when the first
    cannot be opened) and collects the image and video files whose path
    contains ``message``.

    Returns:
        A JSON object string with the keys ``name``, ``piclink``,
        ``vidlink`` and ``info``.
    """
    message = request.args.get('message')
    # NOTE(review): ``message`` is taken from the query string and is
    # interpolated into file paths below -- confirm it is validated upstream.
    try:
        d = docx.opendocx("./app/static/doc/%s_1.docx" % message)
    except Exception:
        # Fall back directly here: the previous version tested a ``flag``
        # variable that was never assigned when the .docx opened fine.
        d = docx.opendocx("./app/static/doc/%s_1.doc" % message)

    doc = docx.getdocumenttext(d)

    def _links(directory):
        # Keep entries whose joined path mentions ``message``; the [6:]
        # slice drops the leading "./app/" from the stored link.
        found = []
        for entry in os.listdir(directory):
            entry_path = os.path.join(directory, entry)
            if message in entry_path:
                found.append(entry_path[6:])
        return found

    pic_name = _links("./app/static/img/")
    vid_name = _links("./app/static/vid/")
    print(vid_name)
    print(pic_name)

    datai = {
        "name": message,
        "piclink": pic_name,
        "vidlink": vid_name,
        "info": doc,
    }
    return json.dumps(datai, ensure_ascii=False)
Exemple #2
0
def main():
    """Write the text of the .docx named by ``sys.argv[1]`` to ``sys.argv[2]``.

    The previous version opened both command-line paths and then ignored
    them, reading a hard-coded 'a.docx' and writing 'output.txt' instead.
    """
    infil = opendocx(sys.argv[1])
    paragraphs = getdocumenttext(infil)

    # For Unicode handling.
    new_paragraphs = [paragraph.encode("utf-8") for paragraph in paragraphs]

    # The context manager guarantees the output handle is closed.
    with open(sys.argv[2], 'w') as outfil:
        outfil.write('\n'.join(new_paragraphs))
Exemple #3
0
def getTheText(fileNameInput, newfile, fileType):
    """Extract the text of one document part and append it to ``newfile``.

    Args:
        fileNameInput: forwarded to ``opendocx`` as its first argument.
        newfile: object with a ``write`` method that receives the text.
        fileType: forwarded to ``opendocx`` as its second argument.

    Returns:
        ``' found'`` when extraction ran without error, ``' not found'``
        when any step failed.
    """
    try:
        # open the respective xml file and extract its text
        document = opendocx(fileNameInput, fileType)
        paratextlist = getdocumenttext(document)

        # if any text is found, encode it and write it to file
        if paratextlist:
            encoded = [text.encode("utf-8") + '\n' for text in paratextlist]
            newfile.write(''.join(encoded) + '\n\n')
    except Exception:
        # Best effort: every failure is reported as "not found".  Unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        return ' not found'

    return ' found'
def getTheText(fileNameInput,newfile,fileType):
    """Read one document part, convert its text and write it to ``newfile``.

    Args:
        fileNameInput: first argument handed to ``opendocx``.
        newfile: writable object; receives the UTF-8 encoded paragraphs.
        fileType: second argument handed to ``opendocx``.

    Returns:
        ``' found'`` on success and ``' not found'`` if anything failed.
    """
    status = ' found'

    try:
        document = opendocx(fileNameInput, fileType)
        paratextlist = getdocumenttext(document)

        # Only touch the output when there is text to write
        if paratextlist:
            lines = [paratext.encode("utf-8") + '\n' for paratext in paratextlist]
            newfile.write(''.join(lines) + '\n\n')
    except Exception:
        # Any failure is folded into the status string; narrowing from a
        # bare ``except`` keeps Ctrl-C and sys.exit() working.
        status = ' not found'

    return status
	def document_to_text(self, filename, file_path):
		"""Extract plain text from a document and store it in ``self.raw``.

		The format is chosen from the suffix of ``filename``; the content is
		read from ``file_path``. ``.doc`` and ``.odt`` are converted by the
		external ``antiword`` / ``odt2txt`` programs, ``.docx`` by
		``opendocx``/``getdocumenttext``, ``.pdf`` by
		``self.convert_pdf_to_txt`` and ``.txt`` is read directly. For any
		other suffix ``self.raw`` is left untouched.

		The indentation is normalized to tabs: the previous mix of tabs and
		spaces is rejected by Python 3 (TabError).
		"""
		if filename.endswith(".doc") or filename.endswith(".odt"):
			tool = 'antiword' if filename.endswith(".doc") else 'odt2txt'
			p = Popen([tool, file_path], stdout=PIPE)
			stdout, stderr = p.communicate()
			# Non-ASCII bytes in the tool output are dropped here
			self.raw = stdout.decode('ascii', 'ignore')

		elif filename.endswith(".docx"):
			document = opendocx(file_path)
			paratextlist = getdocumenttext(document)
			newparatextlist = [paratext.encode("utf-8") for paratext in paratextlist]
			self.raw = '\n\n'.join(newparatextlist)

		elif filename.endswith(".pdf"):
			self.raw = self.convert_pdf_to_txt(file_path)

		elif filename.endswith(".txt"):
			with open(file_path, 'r') as file_:
				self.raw = file_.read()
Exemple #6
0
def document_to_text(filename, file_path):
    """Convert a .doc/.docx/.odt document to text and return it.

    The format is chosen from the suffix of ``filename``; the content is
    read from ``file_path``. For .docx and .odt input the text is also
    written to the hard-coded dump file opened below.

    Returns:
        The extracted text, or ``None`` (after printing "Can't convert")
        for an unsupported suffix.
    """
    output = open('/home/yung/download/output_odt.txt', 'w')
    # try/finally closes the dump file on every path; previously each
    # ``return`` skipped the trailing ``output.close()``.
    try:
        if filename[-4:] == ".doc":
            p = Popen(['antiword', file_path], stdout=PIPE)
            stdout, stderr = p.communicate()
            return stdout.decode('ascii', 'ignore')
        elif filename[-5:] == ".docx":
            document = opendocx(file_path)
            newparatextlist = []
            for paratext in getdocumenttext(document):
                temp_docx = paratext.encode("utf-8")
                newparatextlist.append(temp_docx)
                output.write(temp_docx)
            return '\n\n'.join(newparatextlist)
        elif filename[-4:] == ".odt":
            p = Popen(['odt2txt', file_path], stdout=PIPE)
            stdout, stderr = p.communicate()
            temp_odt = stdout.decode('ascii', 'ignore')
            output.write(temp_odt)
            return temp_odt
        else:
            print("Can't convert")
    finally:
        output.close()
Exemple #7
0
def document_to_text(path):
    """Return ``(file name, text)`` for the document stored at ``path``.

    ``.doc`` files are converted by the ``antiword.exe`` bundled in the
    ``antiword`` directory next to this module, ``.docx`` by
    ``opendocx``/``getdocumenttext``, ``.pdf`` by ``convert_pdf_to_txt``;
    anything else is read as a plain file. Line breaks are removed from the
    .doc and .pdf results.
    """
    ext = os.path.splitext(path)[1]
    filename = os.path.basename(path)

    if ext == ".doc":
        antiword = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "antiword", "antiword.exe"
        )
        # Argument list without a shell: the path is no longer spliced into
        # a command string, so spaces or shell metacharacters in it are safe.
        pipe = Popen([antiword, "-m", "CP852", path], stdout=PIPE)
        text = pipe.communicate()[0]
        # ``unicode`` is the Python 2 builtin this block already relied on
        return filename, unicode(re.sub("\r|\n", "", text).strip())

    elif ext == ".docx":
        document = opendocx(path)
        return filename, "".join(getdocumenttext(document))

    elif ext == ".pdf":
        text = convert_pdf_to_txt(path)
        return filename, unicode(re.sub("\r|\n", "", text).strip())

    else:
        # Close the handle deterministically instead of leaking it
        with open(path, "r") as handle:
            text = unicode(handle.read()).strip()
        return filename, text
Exemple #8
0
def readdocx(f,stream_reader):
    """Return the concatenated paragraph text of a .docx stream.

    ``f`` and ``stream_reader`` are handed to ``stream_to_file``; when that
    returns a falsy value this function returns ``None``.
    """
    f = stream_to_file(f, stream_reader)
    if not f:
        return None
    d = docx.opendocx(f)
    txt = ''.join(docx.getdocumenttext(d))
    # Single formatted argument: prints the same line as the former
    # Python 2 print statement and also parses on Python 3.
    print('reading docx %s %s' % (f, txt))
    return txt
Exemple #9
0
def gen():
    """Print the lines (or .docx paragraphs) of a chosen file in random order.

    Prompts for a file name, shuffles its lines/paragraphs and prints each
    one that is not a bare newline, pausing half a second between items.
    """
    try:
        import docx
    except ImportError:
        docx = None
        print("Please type the command 'pip install docx' to continue")

    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input
    file_name = ask("Enter the file name to randomise words from: ")

    if file_name.endswith("docx"):
        if docx is None:
            # Without the library a .docx cannot be read; previously this
            # path died with a NameError.
            return
        # opendocx takes the path itself; the old code handed it an
        # ``xreadlines`` iterator over a text-mode handle it never closed.
        data = docx.getdocumenttext(docx.opendocx(file_name))
    else:
        with open(file_name, "r") as acc:
            data = acc.readlines()

    random.shuffle(data)
    for i in data:
        if i != "\n":
            print(i)
            time.sleep(0.5)
Exemple #10
0
def wordToText(path):
    """Write the text of the .docx at ``path`` next to it as ``<stem>1.txt``.

    Paragraphs are UTF-8 encoded and separated by two newlines. If the
    input cannot be opened or the output cannot be created, a usage hint
    is printed and the interpreter exits.
    """
    inputFile = path
    outputFile = os.path.splitext(path)[0] + "1.txt"
    print(path)
    print(outputFile)
    print(inputFile)

    try:
        document = opendocx(inputFile)
        newfile = open(outputFile, 'w')
    except Exception:
        print(
            "Please supply an input and output file. For example:\n"
            "  example-extracttext.py 'My Office 2007 document.docx' 'outp"
            "utfile.txt'"
        )
        exit()

    # try/finally: the output file used to be left open
    try:
        # Fetch all the text out of the document and encode it explicitly
        newparatextlist = [
            paratext.encode("utf-8") for paratext in getdocumenttext(document)
        ]
        # Two newlines under each paragraph
        newfile.write('\n\n'.join(newparatextlist))
    finally:
        newfile.close()
Exemple #11
0
 def document_to_text(self,filename, file_path):
     """Convert a document to text, chosen by the suffix of ``filename``.

     ``.doc``/``.odt`` are converted by ``antiword``/``odt2txt`` and
     ``.docx`` by ``opendocx``; for these three the text is also written to
     ``<filename without suffix>.txt`` and returned. ``.pdf`` is delegated
     to ``self.convert_pdf_to_txt``. ``.xlsx`` is handed to
     ``self.csv_from_excel`` and ``self.txt_from_csv`` and returns ``None``,
     as does any other suffix.
     """
     if filename.endswith(".doc") or filename.endswith(".odt"):
         tool = 'antiword' if filename.endswith(".doc") else 'odt2txt'
         p = Popen([tool, file_path], stdout=PIPE)
         stdout, stderr = p.communicate()
         text = stdout.decode("ascii", "ignore")
         with open(filename[:-4]+".txt","w") as f:
             f.write(text)
         return text
     elif filename.endswith(".docx"):
         document = opendocx(file_path)
         newparatextlist = [
             paratext.encode("utf-8") for paratext in getdocumenttext(document)
         ]
         text = '\n\n'.join(newparatextlist)
         with open(filename[:-5]+".txt","w") as f:
             f.write(text)
         return text
     elif filename.endswith(".pdf"):
         return self.convert_pdf_to_txt(filename,file_path)
     elif filename.endswith(".xlsx"):
         # The old test ``filename[-4:] == ".xlsx"`` compared a 4-character
         # slice with a 5-character suffix, so this branch never ran.
         self.csv_from_excel(file_path)
         csv = file_path.split(".")[0]+".txt"
         print(csv)
         self.txt_from_csv(csv)
Exemple #12
0
    def traverse3(self):
        """Store the text of each .docx under ``self.path`` on a matching member.

        The last two whitespace-separated words of the file stem are used
        as first and last name; the first ``member`` whose name contains
        both gets the document text as its ``description``. Names without
        a match are printed.
        """
        for root, dirs, files in os.walk(self.path):
            for file_name in files:
                if not file_name.endswith(".docx"):
                    continue
                full_path = os.path.join(root, file_name)
                name = full_path.split("/")[-1].split(".")[0]
                nl = name.split()
                if len(nl) < 2:
                    # A single-word stem has no first/last pair; indexing
                    # nl[-2] used to raise IndexError here.
                    continue
                fname, lname = nl[-2], nl[-1]
                query = member.objects.filter(
                    name__icontains=fname).filter(name__icontains=lname)

                if query.count() == 0:
                    print("%s %s" % (fname, lname))
                    continue

                print("CCount 111")
                m = query[0]
                # The document is only parsed once a member was found
                paratextlist = getdocumenttext(opendocx(full_path))
                # Every paragraph is prefixed with '\n\r'; built with join
                # instead of shadowing the builtin ``str`` in a += loop.
                m.description = ''.join(
                    '\n\r' + paratext for paratext in paratextlist)
                m.save()
Exemple #13
0
def docx_to_pdf(infilename, outfilename):
    """Render the paragraph text of a DOCX file into a PDF.

    ``infilename`` is opened with ``opendocx``; when that fails a message is
    printed and the process exits with status 1. Every paragraph is UTF-8
    encoded, wrapped at 70 columns and written line by line to a
    ``PDFWriter`` on ``outfilename``, followed by an empty line.
    """
    try:
        source_doc = opendocx(infilename)
    except:
        print("Error opening infilename")
        sys.exit(1)

    texts = getdocumenttext(source_doc)

    writer = PDFWriter(outfilename)
    writer.setFont("Courier", 12)
    writer.setHeader("DOCXtoPDF - convert text in DOCX file to PDF")
    writer.setFooter("Generated by xtopdf and python-docx")
    wrapper = TextWrapper(width=70, drop_whitespace=False)

    # For Unicode handling: encode everything before writing anything.
    encoded_texts = [text.encode("utf-8") for text in texts]

    for encoded in encoded_texts:
        for wrapped_line in wrapper.wrap(encoded):
            writer.writeLine(wrapped_line)
        # blank line between paragraphs
        writer.writeLine("")

    writer.savePage()
    writer.close()
Exemple #14
0
def doctoText(filepath):

    """
    Return the text content of the file at ``filepath``.

    ``.pdf`` files go through ``convertpdftoText`` and ``.docx`` files
    through ``opendocx``/``getdocumenttext`` (paragraphs joined by blank
    lines). Any other file is read as bytes, decoded as UTF-8 and stripped
    of HTML tags with BeautifulSoup; when that fails, ``str()`` of the raw
    bytes is returned.
    http://davidmburke.com/2014/02/04/python-convert-documents-doc-docx-odt-pdf-to-plain-text-without-libreoffice/
    """

    if filepath[-4:] == ".pdf":
        return convertpdftoText(filepath)
    elif filepath[-5:] == ".docx":
        document = opendocx(filepath)
        newparatextlist = [
            paratext.encode("utf-8") for paratext in getdocumenttext(document)
        ]
        return "\n\n".join(newparatextlist)
    else:
        # Read once and reuse the bytes: the old fallback called read() a
        # second time on the exhausted handle and always got empty data.
        with open(filepath, "rb") as myfile:
            htmldata = myfile.read()
        try:
            # cleans html, removes tags
            edata = htmldata.decode("utf-8", "strict")
            raw = BeautifulSoup(edata).get_text()
            return raw.encode("utf-8", "strict")
        except Exception:
            return str(htmldata)
Exemple #15
0
def recogOrgnz(num, docx_name, seg):
    """Fill ``seg[5]`` from the Word file of a paper and return ``seg`` tab-joined.

    When ``docx_name`` does not exist, ``seg[5]`` becomes
    ``[ERROR_NO_FILE]``. Otherwise the paragraphs before the first one that
    starts with 'Abstract' are cleaned (newlines and tabs become ';', the
    dagger characters are removed) and concatenated, each followed by ';'.
    ``num`` is not used.
    """
    # Read the Word file that belongs to the paper and pick out its info
    if os.path.exists(docx_name):
        document = docx.opendocx(docx_name)
        pieces = []
        for paragraph in docx.getdocumenttext(document):
            if paragraph.find('Abstract') == 0:
                break
            cleaned = (paragraph.replace('\n', ';')
                       .replace(u'\u2021', '')
                       .replace(u'\u2020', '')
                       .replace('\t', ';'))
            pieces.append(cleaned + ';')
        doc_text = ''.join(pieces)

        # First token of the first author, used to locate the author line
        lead_author = seg[4].split(';')[0]
        marker = re.split('[ ,.]+', lead_author)[0]
        marker_pos = doc_text.find(marker)
        if marker_pos != -1:
            seg[5] = doc_text[marker_pos:]
        else:
            seg[5] = doc_text[doc_text.find(';') + 1:]
        if len(seg[5]) < 5:
            seg[5] = '[ERROR_FILTER_INFO]'
        # NOTE(review): this unconditional assignment discards the filtered
        # value computed above -- confirm whether it is intentional.
        seg[5] = doc_text
    else:
        seg[5] = u'[ERROR_NO_FILE]'
    return u'\t'.join(seg)
Exemple #16
0
def doctoText(filepath):
    '''
    Return the text of the input file as a string.

    PDF files are delegated to ``convertpdftoText``; .docx files are read
    with ``opendocx``/``getdocumenttext`` and their paragraphs joined by
    blank lines. Every other file is treated as UTF-8 HTML whose tags are
    removed with BeautifulSoup; if that fails, ``str()`` of the raw bytes
    is returned instead.
    http://davidmburke.com/2014/02/04/python-convert-documents-doc-docx-odt-pdf-to-plain-text-without-libreoffice/
    '''

    if filepath.endswith(".pdf"):
        return convertpdftoText(filepath)

    if filepath.endswith(".docx"):
        paragraphs = getdocumenttext(opendocx(filepath))
        return '\n\n'.join([p.encode("utf-8") for p in paragraphs])

    with open(filepath, 'rb') as myfile:
        data = myfile.read()

    try:
        # cleans html, removes tags
        cleaned = BeautifulSoup(data.decode('utf-8', 'strict')).get_text()
        return cleaned.encode('utf-8', 'strict')
    except Exception:
        # Reuse the bytes already read; reading the handle again (as the
        # previous code did) yields nothing because it is at end of file.
        return str(data)
Exemple #17
0
def parse_docx(full_path):
    """Return the UTF-8 encoded paragraphs of a .docx joined by blank lines.

    The document is opened from ``full_path`` exactly as given. The former
    local ``fullpath = "../" + full_path`` was never used and is dropped.
    NOTE(review): if the "../" prefix was meant to apply, the caller should
    pass the adjusted path.
    """
    document = opendocx(full_path)
    return '\n\n'.join(
        [paratext.encode("utf-8") for paratext in getdocumenttext(document)]
    )
Exemple #18
0
    def openSource(self, custom):
        """
        Open a document for processing.

        :param custom:  full path to the file
        :return:  [document, document_name]
        """
        document = opendocx(custom)
        return [document, custom]
Exemple #19
0
def convert_doc_to_txt(file_path):
    """Return the paragraphs of the .docx at ``file_path`` as one string.

    Each paragraph is UTF-8 encoded; paragraphs are separated by a blank line.
    """
    from docx import opendocx, getdocumenttext

    paragraphs = getdocumenttext(opendocx(file_path))
    encoded = [text.encode("utf-8") for text in paragraphs]
    return '\n\n'.join(encoded)
Exemple #20
0
    def __init__(self, filename, filetype='extension'):
        """
        open TextReader file

        :param filename: name of file to open, or list-like
        :param filetype: if filename is list, this should be filetype list should be interpreted as
        :raises parameterError: if the file extension is not in VALIDFTYPES,
            or (for list input) filetype is not in VALIDLTYPES
        """

        # a real filename is a string (``unicode`` is the Python 2 builtin
        # this block already relied on); isinstance also accepts subclasses
        if isinstance(filename, (str, unicode)):
            self.ftype = filename.split('.')[-1].lower() # get extension
            self.intype = 'file'
            if self.ftype not in VALIDFTYPES:
                # call form of raise: valid on Python 2 and Python 3
                raise parameterError('Invalid filename {}: must have extension in {}'.format(filename,VALIDFTYPES))
        # otherwise assume 'filename' is list-like
        else:
            self.ftype = filetype.lower()
            self.intype = 'list'
            if self.ftype not in VALIDLTYPES:
                raise parameterError('Invalid list: must use filetype in {}'.format(VALIDLTYPES))

        # handle excel files
        if self.ftype in ['xls','xlsx']:
            import xlrd
            self.workbook = xlrd.open_workbook(filename)
            self.sheet = self.workbook.sheet_by_index(0)    # only first sheet is considered
            self.currrow = 0
            self.nrows = self.sheet.nrows
            self.delimited = True               # rows are already broken into columns
            self.workbook.release_resources()   # sheet is already loaded so we can save memory

        # handle word files
        elif self.ftype in ['docx']:
            import docx
            doc = docx.opendocx(filename)
            self.lines = iter(docx.getdocumenttext(doc))
            self.delimited = False

        # handle txt files
        elif self.ftype in ['txt']:
            if self.intype == 'file':
                self.TXT = open(filename,'r')
            else:
                self.TXT = iter(filename)
            self.delimited = False

        # handle csv files
        elif self.ftype in ['csv']:
            if self.intype == 'file':
                self._CSV = open(filename,'rb')
            else:
                self._CSV = iter(filename)
            self.CSV = csv.reader(self._CSV)
            self.delimited = True

        self.delimiters = None
        self.opened = True
    def parse_word_file(self, filename):
        """Parse a Word file: feed its text to the paragraph and statistics parsers.

        The paragraphs of ``filename`` are joined with single spaces; the
        joined text goes to ``self.parse_paragraphs`` and its word tokens
        (runs of word characters and apostrophes) to
        ``self.parse_text_statistics``.
        """
        # Read file
        paragraphs = docx.getdocumenttext(docx.opendocx(filename))
        text = " ".join(paragraphs)
        self.parse_paragraphs(text)

        # Analyse
        tokens = re.findall(r"[\w']+", text)
        self.parse_text_statistics(tokens)
Exemple #22
0
    def parse_file(self, f):
        """Return ``{'text': ...}`` holding the paragraphs of the .docx ``f``.

        Paragraphs are UTF-8 encoded and joined with single newlines.
        """
        encoded = [p.encode('utf-8') for p in getdocumenttext(opendocx(f))]
        return {'text': '\n'.join(encoded)}
Exemple #23
0
def main(argv=None):
    """Load the workcal configuration and read the schedule paragraphs.

    Reads '.workcal/config' / 'workcal.ini' (writing a default one when
    neither exists), parses the command line and extracts the paragraph
    list either from a .docx file or, failing that, from the raw bytes of
    the given file.

    :param argv: argument list for the parser; ``None`` means ``sys.argv[1:]``
    """
    global USRTZ, CALL_START_TIME, CALL_END_TIME, MARK_BEGIN

    #Load configuration
    f_config = ['.workcal/config', 'workcal.ini'] #TODO: make generic?
    config = configparser.SafeConfigParser()
    found = config.read(f_config)
    if found:
        USRTZ = pytz.timezone(config.get('core', 'timezone'))
        callst = config.get('core', 'call-start').split(':')
        callen = config.get('core', 'call-end').split(':')
        CALL_START_TIME = time(int(callst[0]), int(callst[1]))
        CALL_END_TIME = time(int(callen[0]), int(callen[1]))
        MARK_BEGIN = config.get('core', 'file-start')
    else:
        config.add_section('core')
        config.set('core', 'timezone', 'UTC')
        config.set('core', 'call-start', '16:30')
        config.set('core', 'call-end', '7:30')
        config.set('core', 'file-start', 'PICU SCHEDULE')
        if platform.system() == 'Windows':
            with open('workcal.ini', 'wb') as configfile:
                config.write(configfile)
        else:
            os.mkdir('.workcal')
            with open('.workcal/config', 'wb') as configfile:
                config.write(configfile)

    #Command line setup
    parser = argparse.ArgumentParser(prog='workcal',
        description='Process emailed Word file containing work schedule.',
        epilog="Ex:\n\tworkcal -f dec.docx -p Tanaka")
    parser.add_argument('-i', '--ics',
        help='output to iCalendar format [RFC 5545]',
        action='store_true')
    parser.add_argument('-f', '--file',
        help='file to get calendar information from',
        nargs='?', type=argparse.FileType('r'),
        default=sys.stdin)
    parser.add_argument('-p', '--person',
        help='schedule for person requested, otherwise all events listed',
        default='Tanaka')
    # Honour the argv parameter; None falls back to sys.argv[1:]
    args = parser.parse_args(argv)
    #Open file
    try:
        document = opendocx(args.file)
        paratextlist = getdocumenttext(document)
    # Tuple form: ``except KeyError, TypeError:`` caught only KeyError and
    # bound the exception to the name TypeError.
    except (KeyError, TypeError): #File is not docx
        with open(args.file.name, 'rb') as f:
            document = f.read()
        ldoc = re.split('\r|\x07', document) #FRAGILE
        start = ldoc.index(MARK_BEGIN) #FRAGILE
        preparatextlist = [x for x in ldoc[start:]
            if not re.search('[\x00-\x1f|\x7f-\xff]', x)]
        paratextlist = [x for x in preparatextlist if x != ''] #FRAGILE
Exemple #24
0
def ParseDOCX(doc):
    """Concatenate the text of the .docx ``doc`` and pass it to ``ParseHTML``.

    Paragraphs are UTF-8 encoded and joined without a separator; the result
    of ``ParseHTML(text, True)`` is returned.
    """
    paragraphs = getdocumenttext(opendocx(doc))
    html = ''.join([paragraph.encode("utf-8") for paragraph in paragraphs])
    return ParseHTML(html, True)
Exemple #25
0
def read_docx(post_path):
    '''
    Open a docx file and print its list of paragraph texts.

    :param post_path: path of the docx file to open
    :return: None (the paragraphs are only printed)
    '''
    paragraphs = docx.getdocumenttext(docx.opendocx(post_path))
    print(paragraphs)
Exemple #26
0
def loaddoc(request):
    """Open the uploaded ``request.FILES['file']`` as a .docx document.

    The upload is copied into an in-memory buffer and handed to
    ``opendocx``. Any exception is printed; nothing is returned.
    """
    try:
        doc = StringIO()
        doc.write(request.FILES['file'].read())
        document = opendocx(doc)

    # ``as`` form and print() call parse on Python 2.6+ and Python 3,
    # unlike the previous ``except Exception, e`` / print statement.
    except Exception as e: #fix this later
        print(e)
Exemple #27
0
def main():
    """Command-line entry: extract the text of argv[1] (.docx) into argv[2].

    Prints the usage text and exits with status 1 unless exactly two
    arguments are given.
    """
    if len(sys.argv) != 3:
        print(usage())
        sys.exit(1)

    source_path, target_path = sys.argv[1], sys.argv[2]
    infil = opendocx(source_path)
    outfil = open(target_path, 'w')

    extract_docx_text(infil, outfil)
def document_to_text(filename, file_path):
    """Return the text of ``file_path + filename + ".docx"`` and delete the file.

    Paragraphs are UTF-8 encoded and joined by blank lines; the .docx is
    removed after its text has been extracted.
    """
    print("inside doctotxt")
    paragraphs = getdocumenttext(opendocx(file_path + filename + ".docx"))
    encoded = [paragraph.encode("utf-8") for paragraph in paragraphs]
    os.remove(file_path + filename + ".docx")
    return '\n\n'.join(encoded)
	def parse_word_file(self, filename):
		"""Read a Word file and run the paragraph and statistics parsers on it.

		The document paragraphs are joined with spaces; the joined text is
		passed to ``self.parse_paragraphs`` and its word tokens to
		``self.parse_text_statistics``.
		"""
		# Read file
		paragraphs = docx.getdocumenttext(docx.opendocx(filename))
		text = " ".join(paragraphs)
		self.parse_paragraphs(text)

		# Analyse
		tokens = re.findall(r"[\w']+", text)
		self.parse_text_statistics(tokens)
Exemple #30
0
def convert_docx_new(fileori):
    """Return the newline-joined paragraph text of in-memory .docx data.

    ``fileori`` is wrapped in a ``StringIO.StringIO`` buffer and opened
    with ``opendocx``. Returns ``""`` when ``opendocx`` is ``None`` or
    when the conversion fails.
    """
    if opendocx is None:
        return ""
    try:
        document = opendocx(StringIO.StringIO(fileori))
        return "\n".join(getdocumenttext(document))
    except Exception:
        # Best effort by design; narrowed from a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed. The unreachable
        # trailing ``pass`` was removed.
        return ""
def convert_docx_to_text(filename=None, blob=None):
    """Return the text of a .docx given as a filename or a binary object.

    Paragraphs are joined by blank lines.
    """
    # docx.opendocx(file) uses zipfile.ZipFile, which accepts either a
    # filename or a file-like object:
    #   https://github.com/mikemaccana/python-docx/blob/master/docx.py
    #   https://docs.python.org/2/library/zipfile.html
    with get_filelikeobject(filename, blob) as fp:
        paragraphs = docx.getdocumenttext(docx.opendocx(fp))
    return '\n\n'.join(paragraphs)
def docx_to_txt(p, n):  # convert docx to txt
    """Write the paragraphs of the .docx ``p`` to a numbered UTF-8 text file.

    The output name is built from the module-level ``path1``, a fixed
    prefix and ``n``; one paragraph is written per line.
    """
    document = docx.opendocx(p)
    text = docx.getdocumenttext(document)
    out_path = path1 + '\\' + '湖南-无认罪认罚' + str(n) + '.txt'
    # ``with`` closes the file even if a write fails; the loop iterates the
    # paragraphs directly instead of indexing, and no longer rebinds the
    # name ``file`` from document to output handle.
    with open(out_path, 'w', encoding="utf-8") as out:
        for paragraph in text:
            out.write(str(paragraph))
            out.write('\n')
    def get_text_from_docx(self, filename):
        """Return the paragraphs of the .docx ``filename`` joined by spaces.

        Each paragraph is UTF-8 encoded before joining.
        """
        paragraphs = getdocumenttext(opendocx(filename))
        encoded = []
        for paragraph in paragraphs:
            encoded.append(paragraph.encode("utf-8"))
        return ' '.join(encoded)
Exemple #34
0
def docx_to_pdf(infilename, outfilename):
    """Open the DOCX file ``infilename``; exit with status 1 if that fails.

    Only the opening step is present in this block; ``outfilename`` is not
    used here.
    """
    try:
        infil = opendocx(infilename)
    # ``as`` binding and print() calls produce the same output as the old
    # Python 2-only syntax and also parse on Python 3.
    except Exception as e:
        print("Error opening infilename")
        print("Exception: " + repr(e) + "\n")
        sys.exit(1)
def get_trans_txt(path):
    """Return the text of the .docx at ``path``.

    Paragraphs are UTF-8 encoded and separated by a blank line.
    """
    paragraphs = getdocumenttext(opendocx(path))
    return '\n\n'.join([text.encode("utf-8") for text in paragraphs])
Exemple #36
0
	def convert_docx_to_txt(self,f):
		"""Write the text of the .docx ``f`` to the module-level ``outtxt`` path.

		Paragraphs are UTF-8 encoded and separated by blank lines.
		"""
		document = docx.opendocx(str(f))
		newparatextlist = [
			paratext.encode('utf-8')
			for paratext in docx.getdocumenttext(document)
		]
		# Open the output only after the document parsed, so a bad input no
		# longer truncates it, and close it even if the write fails.
		with open(outtxt,'w') as output:
			output.write('\n\n'.join(newparatextlist))
		# Report success after the data has actually been written
		print("Convertion docx to txt is Successfully finished")
Exemple #37
0
def handle_docx(input_path, output_path):
    """Return the newline-joined, UTF-8 encoded text of a .docx file.

    ``output_path`` is created (or truncated) but nothing is written to it.
    NOTE(review): presumably the text was meant to be written there --
    confirm with the callers.

    Returns:
        The document text, or ``"Unable to open the file"`` if the input
        cannot be opened or the output cannot be created.
    """
    try:
        document = opendocx(input_path)
        # Keep the create/truncate side effect but close the handle right
        # away; it used to stay open for the life of the process.
        open(output_path, "w").close()
    except Exception:
        return "Unable to open the file"

    return "\n".join(
        [paratext.encode("utf-8") for paratext in getdocumenttext(document)]
    )
Exemple #38
0
def count_docx(file_name):
    """Return the number of ``\\w+`` word tokens in the .docx ``file_name``.

    Prints a message and exits when the document cannot be opened.
    """
    try:
        document = docx.opendocx(file_name)
    except Exception:
        print('Cannot open file to read.')
        exit()

    # Count on the text itself. The former UTF-8 encode step made the join
    # fail on Python 3 (bytes joined with a str separator).
    text = '\n'.join(docx.getdocumenttext(document))
    return len(re.findall(r'\w+', text))
Exemple #39
0
 def docx(self, path):
     """
     Extract the text of a docx file.

     Input: path of the docx file, joined onto DATA_DIR
     Output: paragraph texts joined by newlines, or None when extraction
     fails (the exception is logged through self.log)
     """
     try:
         document = opendocx(os.path.join(DATA_DIR, path))
         text = '\n'.join(list(getdocumenttext(document)))
     except Exception as e:
         self.log.exception(e)
         return None
     return text
Exemple #40
0
def load_corpus(directory):
    """Load the 'txt8' and 'docx' files found in ``directory``.

    ``directory`` is concatenated directly with each file name, so it must
    end with a path separator.

    Returns:
        ``(texts, docs)``: ``texts`` maps the file name minus its last
        character to the ASCII-decoded content of each '...txt8' file;
        ``docs`` maps each docx file name to ``flatten`` applied to its
        paragraph list.
    """
    texts = {}
    docs = {}
    for f in os.listdir(directory):
        # One formatted argument: same output as the former Python 2 print
        # statement (which put two spaces after the colon).
        print('Loading:  %s' % (directory + f))
        if f.endswith("txt8"):
            with codecs.open(directory + f, 'r', 'ascii', 'ignore') as text:
                texts[f[:-1]] = text.read()
        elif f.endswith('docx'):
            d = docx.clean(docx.opendocx(directory + f))
            # converts to nltk text object
            docs[f] = flatten(docx.getdocumenttext(d))
    return texts, docs
Exemple #41
0
def count_docx(file_name):
    """Count the ``\\w+`` word tokens in the text of the .docx ``file_name``.

    If the document cannot be opened, a message is printed and the
    interpreter exits.
    """
    try:
        document = docx.opendocx(file_name)
    except Exception:
        print('Cannot open file to read.')
        exit()

    paragraphs = docx.getdocumenttext(document)
    # Match against the text directly: joining UTF-8 encoded paragraphs
    # with a str separator raises TypeError on Python 3.
    return len(re.findall(r'\w+', '\n'.join(paragraphs)))
Exemple #42
0
def _docx_to_txt(file_path, dst_dir):
    """
    Extract the text of the docx at file_path, pass it through unidecode
    and save it in dst_dir under the same name with '.docx' replaced by
    '.txt'. Returns 0.
    """
    source_name = os.path.basename(file_path)
    target_name = re.sub(r'\.docx$', '.txt', source_name)
    file_dst = os.path.join(dst_dir, target_name)

    paragraphs = getdocumenttext(opendocx(file_path))
    txt = unidecode('\n'.join(paragraphs))

    with open(file_dst, 'w') as out:
        out.write(txt)
    return 0
def file_list():
    """Read every .docx in the hard-coded 'Rosh on Talmud' directory.

    Returns:
        ``(names, texts)``: ``texts`` maps the part of each file name
        before its first dot to the concatenated paragraph text;
        ``names`` is ``texts.keys()``.
    """
    source_dir = u'C:/Users/eliav/Downloads/Rosh_on_Talmud-2015-03-31/Rosh on Talmud'
    vr = {}
    for f in os.listdir(source_dir):
        print(f)
        parts = f.split(".")
        # Same test as before: the segment after the first dot must be "docx"
        if len(parts) > 1 and parts[1] == "docx":
            document = docx.opendocx(os.path.join(source_dir, f))
            # join instead of repeated string concatenation
            vr[parts[0]] = "".join(docx.getdocumenttext(document))
    files_list = vr.keys()
    return files_list, vr
Exemple #44
0
 def queryPB(self):
     """Search the .docx/.md files under ``<cwd>\\source`` for the query text.

     For every line that contains the text of ``self.lineEdit`` (as typed
     or capitalized), the surrounding section is appended to
     ``self.plainTextEdit``, preceded by the file location. A section runs
     from the nearest preceding line containing "来源" (or the start of the
     file) up to, but not including, the next such line (or the end of the
     file). A section is not emitted twice in a row.
     """
     self.plainTextEdit.clear()
     query = self.lineEdit.text()
     address = os.getcwd() + "\\source"
     for filename in os.listdir(address):
         filepath = address+'\\'+filename
         ext = os.path.splitext(filepath)[1]
         if ext == ".docx":
             docx_lines = docx.getdocumenttext(docx.opendocx(filepath))
         elif ext == ".md":
             # ``with`` closes the handle that used to be left open
             with open(filepath,'r',encoding = "utf-8") as document:
                 docx_lines = document.readlines()
             print(len(docx_lines))
         else:
             # Unsupported file: previously this reused the lines of the
             # preceding file, or raised NameError for the first file.
             continue
         docx_lines_num = len(docx_lines)

         last_head = 0
         last_tail = 0
         for index, line in enumerate(docx_lines):
             if query not in line and query.capitalize() not in line:
                 continue

             # Walk back to the nearest marker line, stopping at the first
             # line instead of wrapping around through negative indexes.
             now_index_head = index - 1
             while now_index_head > 0 and "来源" not in docx_lines[now_index_head]:
                 now_index_head -= 1
             now_index_head = max(now_index_head, 0)

             # Walk forward to the next marker line or the end of the file;
             # bounded, so a hit on the last line no longer raises IndexError.
             now_index_tail = index + 1
             while (now_index_tail < docx_lines_num
                    and "来源" not in docx_lines[now_index_tail]):
                 now_index_tail += 1

             if (last_head != now_index_head) or (last_tail != now_index_tail):
                 last_head = now_index_head
                 last_tail = now_index_tail
                 string = ["文件位置:" + filepath + "\n"]
                 string.extend(i + "\n" for i in docx_lines[now_index_head:now_index_tail])
                 string.append("\n")
                 self.plainTextEdit.appendPlainText("".join(string))
Exemple #45
0
def _docx_to_txt(file_path, dst_dir, file_name):
    """
    Extract the text of the docx at file_path with the docx python module,
    pass it through unidecode and save it as .txt in dst_dir. When
    file_name is None the name is taken from file_path. Returns 0.
    """
    source_name = os.path.split(file_path)[1] if file_name is None else file_name
    file_dst = os.path.join(dst_dir, re.sub(r'\.docx$', '.txt', source_name))

    paragraphs = getdocumenttext(opendocx(file_path))
    txt = unidecode('\n'.join(paragraphs))

    with open(file_dst, 'w') as out:
        out.write(txt)
    return 0
def main():
    """Command-line entry: extract the text of argv[1] (.docx) into argv[2].

    Exits with status 1 when the argument count is wrong (after printing
    the usage text) or when either file cannot be opened.
    """
    if len(sys.argv) != 3:
        print (usage())
        sys.exit(1)

    try:
        infil = opendocx(sys.argv[1])
        outfil = open(sys.argv[2], 'w')
    except Exception:
        # Opening failed: exit without a message
        sys.exit(1)
    else:
        extract_docx_text(infil, outfil)
Exemple #47
0
def main():
    """Parse the report documents under ./data and write ./results.json.

    Every file ``./data/<report>/<file>`` is read as a .docx. Its first two
    paragraphs are resolved through ``get_ref`` as office and topic, the
    third may list inspected units, and paragraphs following a ``_POI``
    marker are collected into result entries.
    """
    reports = os.listdir("./data")
    offices = []
    topics = []
    units = []
    for report in reports:
        files = os.listdir("./data/%s" % (report,))
        for file_name in files:
            doc_path = "./data/%s/%s" % (report, file_name)
            print(doc_path)
            paragraphs = docx.getdocumenttext(docx.opendocx(doc_path))
            results = []
            flag = False
            poi_type = -1  # type uninitialized

            office_name = get_ref(paragraphs[0].strip(), offices)  # First paragraph is the office
            topic = get_ref(paragraphs[1].strip(), topics)  # Second paragraph is the report topic

            # Third paragraph is the inspected units
            # TODO: More intelligent unit extraction
            if u"הגופים המבוקרים:" in paragraphs[2]:
                inspected = paragraphs[2].split(":")[1].strip().split(";")
                new_inspected = []
                for inspectee in inspected:
                    inspectee = inspectee.replace(u"\u2014", "-").split("-")
                    inspectee = [i.strip() for i in inspectee]
                    inspectee_name = inspectee[0] if len(inspectee) == 1 else inspectee[1]
                    # A list (not filter()) so len() also works on Python 3
                    unit = [x for x in units if x["name"] == inspectee_name]
                    if len(unit) == 0:
                        office = "" if len(inspectee) == 1 else get_ref(inspectee[0].strip(), offices)
                        units.append({"name": inspectee_name, "slug": hash(inspectee_name), "office": office})
                        new_inspected.append(units[-1]["name"])  # TODO: use slug instead of name
                    else:
                        new_inspected.append(unit[0]["name"])  # TODO: use slug instead of name

            for paragraph in paragraphs:
                paragraph = paragraph.strip()
                if paragraph in _POI:
                    flag = True
                    results.append({"id": 0, "type": 0, "status": 0, "text": "", "followup": "", "link": "", "report": report, "unit": "", "topic": topic.encode("utf-8"), "office": office_name.encode("utf-8")})
                    poi_type = _POI.index(paragraph)
                elif flag:
                    id_match = re.match(r"(\d+)\.?", paragraph)
                    if id_match:
                        results[-1]["id"] = int(id_match.group(1))

                    results[-1][_POI_TYPES[poi_type]] += re.sub(r"(\d+)\.?", "", paragraph).encode("utf-8") if len(paragraph) > 1 else ""

            # TODO: Properly compute status
            # json.dump writes to the file; the previous
            # ``json.dumps(results, open(...))`` passed the handle as a
            # positional option, discarded the string and wrote nothing.
            # NOTE(review): the file is rewritten for every document, so
            # only the last document's results remain -- confirm intent.
            with open("./results.json", "w+") as out:
                json.dump(results, out)
Exemple #48
0
def main():
    """Parse every report under ./data and insert its findings into the table.

    For each file, the first three paragraphs are read as the office, the
    report topic and a ';'-separated list of inspected units (each optionally
    written as "office - unit").  Every later paragraph that equals an entry
    of ``_POI`` starts a new finding; the paragraphs that follow it are
    appended to the field of that finding selected through ``_POI_TYPES``.
    One row per (finding, inspected unit) pair is handed to
    ``update_table.insert_row``.

    Side effects: creates/truncates ./result.txt (nothing is written to it).
    """
    files = os.listdir("./data")
    # "w+" creates/truncates result.txt.  Nothing below writes to it, but the
    # handle stays open for the whole run as before and is now closed on exit.
    result = codecs.open("./result.txt", "w+", "utf-8")
    # Compiled once; raw string so "\d" and "\." are not string escapes.
    number_re = re.compile(r"(\d+)\.?")
    offices = []
    topics = []
    units = []
    try:
        for file_name in files:
            paragraphs = docx.getdocumenttext(docx.opendocx("./data/%s" % (file_name, )))
            results = []
            in_finding = False
            poi_type = -1  # no _POI heading seen yet

            #TODO: Switch to using plain lists for offices, topics & units
            office_name = get_ref(paragraphs[0].strip(), offices)  # First paragraph is the office
            topic = get_ref(paragraphs[1].strip(), topics)  # Second paragraph is the report topic

            #Third paragraph is the inspected units
            #TODO: Insure the paragraph is a list of inspectees, and clean the header
            #TODO: Smarter filtering for units and offices
            new_inspected = []
            for inspectee in paragraphs[2].strip().split(";"):
                parts = [part.strip() for part in inspectee.split("-")]
                inspectee_name = parts[0] if len(parts) == 1 else parts[1]
                # A list (not filter()) so the emptiness test also works where
                # filter() returns an iterator.
                known = [unit for unit in units if unit["name"] == inspectee_name]
                if known:
                    new_inspected.append(known[0]["name"])
                    continue
                office = "" if len(parts) == 1 else get_ref(parts[0].strip(), offices)
                units.append({"name": inspectee_name, "slug": hash(inspectee_name), "office": office})
                new_inspected.append(units[-1]["name"])

            for paragraph in paragraphs:
                paragraph = paragraph.strip()
                if paragraph in _POI:
                    in_finding = True
                    results.append({"id": 0, "type": 0, "status": 0, "text": "", "followup": "", "link": "", "report": "", "unit": "", "topic": topic.encode("utf-8"), "office": office_name.encode("utf-8")})
                    poi_type = _POI.index(paragraph)
                elif in_finding:
                    number = number_re.match(paragraph)
                    if number:
                        # A leading number is taken as the finding id.
                        results[-1]["id"] = int(number.group(1))

                    # Every digit run (with an optional trailing dot) is
                    # stripped from the text; one-character paragraphs add nothing.
                    results[-1][_POI_TYPES[poi_type]] += (
                        number_re.sub("", paragraph).encode("utf-8") if len(paragraph) > 1 else "")

            #TODO: Properly compute status
            for entry in results:
                for inspectee in new_inspected:
                    update_table.insert_row((entry["id"], entry["type"], entry["status"], entry["text"], entry["followup"], entry["link"], entry["report"], inspectee.encode("utf-8"), entry["topic"], entry["office"]))
    finally:
        result.close()
Exemple #49
0
def document_to_text(filename, file_path):
    """Return the text content of a .doc, .docx or .pdf file.

    The format is picked from the (case-sensitive) suffix of ``filename``;
    the content itself is read from ``file_path``.  ``.doc`` files are run
    through the external ``antiword`` command, ``.docx`` files are read
    paragraph by paragraph, and ``.pdf`` files are delegated to
    ``convert_pdf_to_txt``.  Any other suffix yields ``None``.
    """
    if filename.endswith(".doc"):
        # antiword writes the document text to stdout; bytes that are not
        # ASCII are dropped by the decode below.
        antiword = Popen(['antiword', file_path], stdout=PIPE)
        raw_output = antiword.communicate()[0]
        return raw_output.decode('ascii', 'ignore')

    if filename.endswith(".docx"):
        encoded_paragraphs = [
            paragraph.encode("utf-8")
            for paragraph in getdocumenttext(opendocx(file_path))
        ]
        # Paragraphs are separated by a blank line.
        return '\n\n'.join(encoded_paragraphs)

    if filename.endswith(".pdf"):
        return convert_pdf_to_txt(file_path)

    return None
def convDocx(portfolios_dir, f):
    """Build a report document dict from a .docx portfolio file.

    Args:
        portfolios_dir: directory that contains the file.
        f: file name; also stored as the document ``_id`` and passed to
            ``email_from_fname`` to fill ``student_email``.

    Returns:
        A dict with the keys ``doctype``, ``origin``, ``_id``,
        ``student_email`` and ``content``.  ``content`` gets one
        ``{'header': ..., 'text': ...}`` entry for every paragraph pair
        yielded by ``pairwise`` whose first element is not much longer than
        its second; when no pair qualifies it holds a single placeholder
        entry instead.
    """
    docx_filepath = os.path.join(portfolios_dir, f)
    paratextlist = getdocumenttext(opendocx(docx_filepath))

    doc_dict = {'doctype': "report", 'origin': "shareworks"}
    doc_dict['_id'] = f
    doc_dict['student_email'] = email_from_fname(f)
    doc_dict['content'] = []
    for pair in pairwise(paratextlist):
        header, text = pair[0], pair[1]
        if not text:
            # An empty paragraph cannot be the body under a header; skipping
            # it also keeps the ratio below from dividing by zero.
            continue
        # NOTE(review): under Python 2 this is integer division, so the
        # effective cut-off there is len(header) < 2 * len(text) - confirm
        # which interpreter this runs on before tightening the test.
        if len(header) / len(text) < 1.5:  # assume header is not much longer than text
            doc_dict['content'].append({'header': header, 'text': "\n\n".join(pair)})
    if not doc_dict['content']:
        # Placeholder so that downstream consumers always find one entry.
        doc_dict['content'] = [{'text': 'FrozenCutlery', 'txt_ann': None}]

    return doc_dict
def docxExtract(docxfile):
    """Return the text of a .docx file with paragraphs encoded as UTF-8.

    Paragraphs are joined with a single newline.  If the document cannot be
    opened, an error message is printed and ``SystemExit`` is raised.
    """
    try:
        document = opendocx(docxfile)
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed and reported as an open failure.
        print("Error opening docx")
        # Same exception (and exit status) as the former ``exit()`` call,
        # without relying on the helper injected by the ``site`` module.
        raise SystemExit()

    # Fetch all the text out of the document we just opened
    paratextlist = getdocumenttext(document)

    # Encode every paragraph explicitly, then join with one newline each
    return '\n'.join([paratext.encode("utf-8") for paratext in paratextlist])
Exemple #52
0
def readdocx():
    '''Microsoft document reader

    Looks up the uploaded file whose uuid equals ``request.vars.did`` and
    renders it as HTML:

    * ``txt`` files are returned as one HTML paragraph;
    * ``docx`` files are rendered paragraph by paragraph with every
      occurrence of ``request.vars.h`` (default "VixenServer") highlighted.
      A dict for the view is returned, unless ``request.vars.raw`` is
      'true', in which case a self-contained HTML fragment is returned.

    Every other case (no matching record, file missing on disk, any other
    extension) sets an error flash message and redirects to the referer.

    Document text and the highlight term are HTML-escaped before being
    embedded, because both come from uploads / the query string.
    '''
    from xml.sax.saxutils import escape

    did = request.vars.did
    highlight = request.vars.h
    if not highlight:
        highlight = "VixenServer"
    fdb = db(db.vfile.uuid == did).select().last()

    # Only build the path once a record was found: ``fdb`` can be empty, and
    # reading ``fdb.name`` first would fail before the error path is reached.
    fpath = None
    if fdb:
        upfolder = '%sstatic/uploads' % request.folder
        frpath = '%s/%s' % (upfolder, fdb.name)
        fpath = os.path.abspath(frpath)

    if fpath and os.path.isfile(fpath):
        if fdb.ext == 'txt':
            with open(fpath, 'r') as handle:
                data = handle.read()
            return '<p style="padding:5px;">{}</p>'.format(escape(data))

        if fdb.ext == 'docx':
            document = docx.opendocx(fpath)
            paragraphs = [
                paratext.encode("utf-8")
                for paratext in docx.getdocumenttext(document)
            ]

            # Crude language guess: anything not starting with an ASCII
            # letter is rendered as 'fa'.  Slicing keeps an empty first
            # paragraph from raising IndexError.
            lang = 'en'
            first_char = paragraphs[0][:1] if paragraphs else ''
            if first_char and first_char.lower() not in string.ascii_lowercase:
                lang = 'fa'

            # Split on the raw term first and escape the pieces afterwards,
            # so the term can never match inside an escaped entity.
            marker = '<b style="background:yellow;color:darkgreen">{}</b>'.format(
                escape(highlight))
            data = '<br/>'.join(
                marker.join(escape(piece) for piece in paragraph.split(highlight))
                for paragraph in paragraphs)
            data = '<div style="padding:5px;">{}</div>'.format(data)
            response.title = 'Document: %s' % fdb.rawname
            if request.vars.raw == 'true':
                if lang == 'fa':
                    style = "padding:2px;background:#fff;direction:rtl;font-family:terafik;font-size:10px;"
                else:
                    style = "padding:2px;background:#fff;font-family:terafik;font-size:10px;"
                return '<div style="%s">%s</div>' % (style, XML(data))
            return dict(name=fdb.rawname, data=XML(data), lang=lang)

    session.flash = 'Error reading document!'
    redirect(request.env.http_referer)