def tilde_decode(s):
    """Decode a '#'-prefixed UTF-8 payload and post-process it with untilde().

    The literal word 'tilde' is swapped to the sentinel 'xilde' before the
    untilde() call and restored afterwards — presumably to shield literal
    occurrences from untilde's own processing (TODO confirm against untilde).

    Returns a ``(decoded_text, length)`` tuple.
    """
    text = BytesIO(s).read().decode('utf-8')
    assert text[0] == '#'
    body = untilde(text[1:].replace('tilde', 'xilde'))
    result = '# ' + body.replace('xilde', 'tilde')
    return result, len(result)
def tilde_decode(s):
    """Decode a '#'-prefixed UTF-8 payload and run it through untilde().

    NOTE(review): duplicate of the single-quoted variant above; the word
    "tilde" is masked as "xilde" around the untilde() call, presumably so
    untilde() leaves it alone — confirm against untilde's contract.
    Returns ``(decoded_text, length)``.
    """
    raw = BytesIO(s).read().decode("utf-8")
    assert raw[0] == "#"
    masked = raw[1:].replace("tilde", "xilde")
    unmasked = "# " + untilde(masked).replace("xilde", "tilde")
    return unmasked, len(unmasked)
def predict():
    """Endpoint: decode a base64-encoded image from the request, recognise an
    arithmetic expression with the CNN, and return it with its evaluated value.

    Returns a JSON string ``{'operation': <recognised text>, 'solution': <value>}``.

    Fixes vs. original: the √-expansion loop counted occurrences up front and
    carried a dead ``if pos != -1`` guard; it is now a simple scan extracted
    into a helper. Behaviour is unchanged, including the ValueError raised when
    '√' is not followed by a digit.
    """
    image = BytesIO(base64.urlsafe_b64decode(request.form['operation']))
    operation = ConvolutionalNeuralNetwork().predict(image)
    # OCR renders multiplication as the glyph 'x'; map it to '*'.
    n_operation = _expand_square_roots(operation.replace('x', '*'))
    return json.dumps({
        'operation': operation,
        # SECURITY: eval() of model output is dangerous — if an attacker can
        # influence the recognised text this is code execution. Replace with
        # a real expression evaluator (e.g. an ast-based parser) before
        # exposing this endpoint to untrusted input.
        'solution': eval(n_operation),
    })


def _expand_square_roots(expr):
    """Replace every '√NNN' in *expr* with ``str(math.sqrt(NNN))``.

    Raises ValueError when a '√' is not followed by at least one digit
    (identical to the original ``int('')`` failure mode).
    """
    while '√' in expr:
        pos = expr.find('√')
        digits = ''
        i = pos + 1
        while i < len(expr) and expr[i].isdigit():
            digits += expr[i]
            i += 1
        # replace() clears every occurrence of this radical, exactly as before.
        expr = expr.replace('√' + digits, str(math.sqrt(int(digits))))
    return expr
def writer(row, last):
    """Render one (possibly masked) array row into the enclosing `outfile`.

    Scalars are written directly; array rows go through a temporary buffer so
    masked values can be swapped for the '_' placeholder before wrapping to
    `line_length`. *last* switches the trailing delimiter to a semicolon.

    NOTE(review): the buffer is fed bytes (b', ') yet created via StringIO and
    later .decode('ASCII')-d — this only works if StringIO here is actually a
    bytes buffer (py2-era alias?); confirm before touching this path.
    """
    # Scalar / 0-d rows: print and return early.
    if isscalar(row) or row.ndim == 0:
        outfile.write(startindent + ' ' + str(row.filled().astype(ndarray)) + ';\n')
        return
    buf = StringIO()
    if ma.getmaskarray(row).all():
        # Entirely masked row: emit one '_' placeholder per element.
        buf.write(b', '.join([b'_'] * row.size) + b', ')
    else:
        savetxt(buf, ma.filled(row), fmt,
                delimiter=commaspace, newline=commaspace)
    if last:
        # Back up over the trailing ", " and close the row with ';'.
        buf.seek(-2, 1)
        buf.write(semicolon)
    buf.seek(0, 0)
    text = buf.read().decode('ASCII')
    # Masked fill values were rendered numerically by savetxt; restore '_'.
    text = text.replace(fmt % getattr(row, 'fill_value', 0) + ',', '_,')
    text = textwrap.fill(text, line_length,
                         initial_indent=startindent + ' ',
                         subsequent_indent=startindent + ' ')
    try:
        outfile.write(text)
        outfile.write('\n')
    except Exception as e:
        exception_handler(e, outfile)
def ungzip_files(filename, where_to=None, fLOG=noLOG, fvalid=None,
                 remove_space=True, unzip=True):
    """
    decompress files from a gzip file

    @param      filename        final gzip file (double compression, extension
                                should something like .zip.gz)
    @param      where_to        destination folder (can be None, the result is
                                a list of tuple)
    @param      fLOG            logging function
    @param      fvalid          function which takes two paths (zip name, local
                                name) and return True if the file must be
                                unzipped, False otherwise, if None, the default
                                answer is True
    @param      remove_space    remove spaces in created local path (+ ``',()``)
    @param      unzip           unzip file after gzip
    @return                     list of unzipped files

    .. versionadded:: 1.4
    """
    # Accept an in-memory payload (bytes/bytearray) as well as a file path.
    if sys.version_info[0] == 2:
        if isinstance(filename, bytearray):
            filename = BytesIO(filename)
    else:
        if isinstance(filename, bytes):
            filename = BytesIO(filename)
    # FIX: context manager closes the handle even if read() raises
    # (the original leaked it on error).
    with gzip.open(filename, 'rb') as f:
        content = f.read()
    if unzip:
        return unzip_files(content, where_to=where_to, fLOG=fLOG)
    # NOTE(review): if *filename* was bytes it is a BytesIO here and
    # .replace() will raise AttributeError — this branch only supports
    # real file-system paths (the newer variant of this function fixes it).
    filename = filename.replace(".gz", "")
    with open(filename, "wb") as f:
        f.write(content)
    return filename
def ungzip_files(filename, where_to=None, fLOG=noLOG, fvalid=None,
                 remove_space=True, unzip=True, encoding=None):
    """
    Uncompresses files from a gzip file.

    @param      filename        final gzip file (double compression, extension
                                should something like .zip.gz)
    @param      where_to        destination folder (can be None, the result is
                                a list of tuple)
    @param      fLOG            logging function
    @param      fvalid          function which takes two paths (zip name, local
                                name) and return True if the file must be
                                unzipped, False otherwise, if None, the default
                                answer is True
    @param      remove_space    remove spaces in created local path (+ ``',()``)
    @param      unzip           unzip file after gzip
    @param      encoding        encoding; when set, the gzip stream is read as
                                text instead of bytes
    @return                     list of unzipped files
    """
    if isinstance(filename, bytes):
        is_file = False
        filename = BytesIO(filename)
    else:
        is_file = True
    if encoding is None:
        # Binary path: read raw bytes (handle closed via context manager).
        with gzip.open(filename, 'rb') as f:
            content = f.read()
        if unzip:
            try:
                return unzip_files(content, where_to=where_to, fLOG=fLOG)
            except Exception as e:  # pragma: no cover
                raise IOError(
                    "Unable to unzip file '{0}'".format(filename)) from e
        elif where_to is not None:
            # NOTE(review): assumes *filename* is a path here; a bytes input
            # (BytesIO) would fail in os.path.split — confirm callers.
            filename = os.path.split(filename)[-1].replace(".gz", "")
            filename = os.path.join(where_to, filename)
            with open(filename, "wb") as f:
                f.write(content)
            return filename
        return content
    else:
        # BUG FIX: honour the caller's *encoding* (was hard-coded "utf-8",
        # silently ignoring the parameter).
        with gzip.open(filename, 'rt', encoding=encoding) as f:
            content = f.read()
        if is_file:
            filename = filename.replace(".gz", "")
            # BUG FIX: *content* is str in this branch; the original opened
            # the output in "wb" which raises TypeError on write.
            with open(filename, "w", encoding=encoding) as f:
                f.write(content)
            return filename
        return content
def writer(row, last):
    """Write a single (possibly masked) array row to the enclosing `outfile`.

    Duplicate of the earlier `writer`; the dead array2string-based draft that
    was carried here as commented-out code has been dropped.

    NOTE(review): bytes (b', ') are written into a StringIO and the result is
    .decode('ASCII')-d — only valid if StringIO is really a bytes buffer in
    this module (py2-era alias?); verify before modifying.
    """
    # 0-d / scalar rows take the short path.
    if isscalar(row) or row.ndim == 0:
        outfile.write(startindent + ' ' + str(row.filled().astype(ndarray)) + ';\n')
        return
    scratch = StringIO()
    if ma.getmaskarray(row).all():
        # Every element masked: one '_' placeholder per element.
        scratch.write(b', '.join([b'_'] * row.size) + b', ')
    else:
        savetxt(scratch, ma.filled(row), fmt,
                delimiter=commaspace, newline=commaspace)
    if last:
        # Replace the trailing ", " with the closing semicolon.
        scratch.seek(-2, 1)
        scratch.write(semicolon)
    scratch.seek(0, 0)
    rendered = scratch.read().decode('ASCII')
    # savetxt printed masked fill values numerically; restore '_' markers.
    rendered = rendered.replace(
        fmt % getattr(row, 'fill_value', 0) + ',', '_,')
    rendered = textwrap.fill(
        rendered, line_length,
        initial_indent=startindent + ' ',
        subsequent_indent=startindent + ' ')
    try:
        outfile.write(rendered)
        outfile.write('\n')
    except Exception as e:
        exception_handler(e, outfile)
# NOTE(review): this fragment opens with a stray ''' whose matching quote is
# not visible in this chunk, and it mixes setup-script logic: patching the
# downloaded wgrib.c source (renaming main -> GRIB_MAIN, converting exit()
# calls to return) and downloading/extracting the wgrib2 tarball. It appears
# truncated/mangled at the chunk boundary, so the code is left byte-identical.
# Known smells to revisit once the full file is visible: a bare `except:`
# around the re.sub fallback, and the second re.sub calling .encode() on an
# object that is already bytes.
''' BDS_unpack_mod = bytes( r'(\*\*\*\\n\", n\);\n)[\t ]+exit\(8\);\n[\t ]+for'.encode('ascii')) if 'build_ext' in sys.argv and not os.path.exists(here + '/src/wgrib.c'): print('Downloading wgrib source code...') request = urllib.request.urlopen(wgrib_url) with open(here + '/src/wgrib.c', 'wb') as wgrib_src: src = BytesIO(request.read().replace( c_main, define + c_main.replace(b'main', b'GRIB_MAIN'))) try: src = re.sub(BDS_unpack_mod, b'\\1\treturn;\tfor', src.getvalue()) except: src = re.sub(bytes(BDS_unpack_mod.encode('ascii')), b'\\1\treturn;\tfor', src.getvalue()) src = src.replace(b'exit(', b'return(') wgrib_src.write(src) if 'build_ext' in sys.argv and not isWindows() and not os.path.isdir( here + '/src/grib2'): tarfilepath = os.path.join(here, 'src', os.path.basename(wgrib2_url)) if not os.path.exists(tarfilepath): print('Downloading wgrib2 source code...') request = urllib.request.urlopen(wgrib2_url) with open(tarfilepath, 'wb') as tgz: tgz.write(request.read()) print('Extracting src/{}...'.format(os.path.basename(tarfilepath))) with tarfile.open(tarfilepath, mode='r:gz') as tgz: for name in tgz.getnames(): print('extracting src/{}'.format(name))
# Extract metadata and metrics from one PDF (encryption status, tagged-PDF
# flag, page count, TOC, form fields, word/char counts, image-per-page ratio,
# an OCR-risk heuristic, and documentInfo fields) and append one CSV row to
# the report file. NOTE(review): the statements of this method are collapsed
# onto a few physical lines in this chunk; the code is left byte-identical and
# only section comments are inserted. Known smells to revisit with the full
# file in view: `document = PDFDocument` binds the class (not an instance)
# before being overwritten from self.document_t; the failure path writes a
# CSV row but does not return, so execution falls through; `exit_call` is
# printed in later except blocks before being reassigned (stale message).
# --- load the document in a worker thread (90 s timeout), open with
# --- PyPDF's PdfFileReader, then record the encryption status ---
def pdf(self, fp, csv_row): password = '' extracted_text = '' self.parser = PDFParser(fp) self.document_t = PDFDocument pf = PdfFileReader # isEncrypted try: i = 0 try: thread = Thread(target=self.load_pdf, args=(PDFDocument, password)) thread.start() thread.join(timeout=90) except Exception as e: print('PDF I/O error: ' + e.__str__()) row = [ self.line_count, 'PDF DOCUMENT OBJECT FAILED TO LOAD - ' + e.__str__() + ': ' + self.url, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ] # self.line_count += 1 report_path = self.report_folder + self.report_name # 90 SECONDS or LOAD FAIL with open(report_path, 'a', encoding='utf8', newline='') as csv_file: writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL) writer.dialect.lineterminator.replace('\n', '') writer.writerow(row) stop_event.set() document = PDFDocument document = self.document_t pf = PdfFileReader(BytesIO(open(self.pdf_path, 'rb').read())) # ENCRYPTION if self.parser.doc.encryption is not None: csv_row.insert(4, [self.csv_header[4], 'ENCRYPTED']) csv_row.insert(5, [self.csv_header[5], 'ENCRYPTED']) else: csv_row.insert(4, [self.csv_header[4], 'FALSE']) csv_row.insert(5, [self.csv_header[5], 'NA']) except Exception as e: csv_row.insert(4, [self.csv_header[4], 'FAILED: ' + e.__str__()]) csv_row.insert(5, [self.csv_header[5], 'NA']) exit_call = e.__str__() + ' document failed!!' print(exit_call) pass page_count = 0 # istagged try: pages = PDFPage.get_pages(document) if not document.is_extractable: raise PDFTextExtractionNotAllowed rsrcmgr = PDFResourceManager() laparams = LAParams() page_no = 0 istagged = 'FALSE' try: # document.catalog if document.catalog['MarkInfo']: istagged = 'TRUE' except Exception as e: exit_call = e.__str__() + ' tagged info failed!!' 
# --- page count + tagged flag into csv_row; then table of contents (TOC)
# --- presence; then AcroForm fields count ---
print(exit_call) page_count = resolve1(document.catalog['Pages'])['Count'] csv_row.insert(6, [self.csv_header[6], istagged]) csv_row.insert(7, [self.csv_header[7], page_count]) except Exception as e: csv_row.insert(6, [self.csv_header[6], 'IsTagged: ' + e.__str__()]) csv_row.insert(7, [self.csv_header[7], 'Page Count: ' + e.__str__()]) exit_call = e.__str__() + ' tagged info failed!!' print(exit_call) # TOC try: if pf.outlines: csv_row.insert(8, [self.csv_header[8], 'TRUE']) '''pdf_path_toc = self.document_folder + pdf_name + '_toc.txt' places_list = pf.outlines with open(pdf_path_toc, 'w') as filehandle: filehandle.writelines("%s\n" % place for place in places_list) filehandle.close()''' else: csv_row.insert(8, [self.csv_header[8], 'FALSE']) except Exception as e: csv_row.insert(8, [self.csv_header[8], 'TOC FAILED: ' + e.__str__()]) exit_call = e.__str__() + ' toc info failed!!' print(exit_call) # isForm, fields, try: if pf.getFields(): csv_row.insert(9, [self.csv_header[9], 'TRUE']) csv_row.insert(10, [self.csv_header[10], pf.getFields().__len__()]) else: csv_row.insert(9, [self.csv_header[9], 'FALSE']) csv_row.insert(10, [self.csv_header[10], 0]) except Exception as e: csv_row.insert(9, [self.csv_header[9], 'FORMS: ' + e.__str__()]) csv_row.insert(10, [self.csv_header[10], 'FIELDS: ' + e.__str__()]) exit_call = e.__str__() + ' forms failed!!' 
# --- text sample (first pages, skipped beyond 50 pages), word/char counts,
# --- then chars-per-word and words-per-page metrics ---
print(exit_call) # tables csv_row.insert(11, [self.csv_header[11], 'NOT RUN']) write_clip = '' word_count = 0 words_per_page = 0 char_count = 0 chars_per_word = 0 image_count = 0 # TODO: write 3 page sample and word count try: if pf.getNumPages() < 50: for page in range(pf.getNumPages()): p = pf.getPage(page) text_clip = p.extractText().encode('UTF-8') text_clip = BytesIO(text_clip).read().__str__()[2:] count_clip = re.findall(r"[^\W_]+", text_clip, re.MULTILINE) word_count += len(count_clip) char_count += len(text_clip) if page <= 3: write_clip += '[ PAGE ' + (page + 1).__str__() + ' START ] ' write_clip += text_clip.replace('\n', '').replace( ',', ' ').replace('"', '') write_clip += '[ PAGE ' + (page + 1).__str__() + ' END ]' else: write_clip = 'OVER 50 PAGES - SAMPLE SKIPPED' except Exception as e: exit_call = e.__str__() + ' :: TEXT sample failed!!' write_clip = exit_call word_count = exit_call char_count = exit_call print(exit_call) # TODO: Words/chars per page try: if not word_count == 0: chars_per_word = char_count / word_count else: chars_per_word = 0 if not page_count == 0: words_per_page = word_count / page_count else: words_per_page = 0 except Exception as e: exit_call = e.__str__() + ' :: WORD METRICS failed!!' 
# --- store word/char metrics in csv_row slots 12-15; the pdfimages-based
# --- image extraction (slot 16) is entirely commented out ---
chars_per_word = exit_call words_per_page = exit_call print(exit_call) # TODO: Add to row i = 12 try: csv_row.insert(i, [self.csv_header[i], word_count.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'WORD_COUNT: ' + e.__str__()]) i = 13 try: csv_row.insert(i, [self.csv_header[i], char_count.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'CHAR_COUNT: ' + e.__str__()]) i = 14 try: csv_row.insert(i, [self.csv_header[i], words_per_page.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'WPP: ' + e.__str__()]) i = 15 try: csv_row.insert(i, [self.csv_header[i], chars_per_word.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'CPP: ' + e.__str__()]) # TODO: IMAGES i = 16 '''try: pdfImages = Globals.base_folder + 'cli-tools\\pdfimages.exe' img_folder = self.document_folder + 'images\\' # + pdf_name[:-4] + '\\' if not os.path.exists(img_folder): os.makedirs(img_folder) # cmd = pdfImages + ' -list ' + '\"' + pdf_path + '\"' # output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\n') # save images to disk cmd = pdfImages + ' -list \"' + self.pdf_path + '\" \"' + ' ' + '\"' # subprocess.Popen(cmd, stdout=subprocess.PIPE) os.chdir(img_folder) image_list = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\r\n') # os.remove(img_folder) # image_count = output.count('\n') image_count = image_list.__len__() if image_count > 2: # target = open(pdf_path_image, 'w') # target.write(image_list) # target.close() csv_row.insert(i, [self.csv_header[i], (image_count - 2).__str__()]) elif image_count == 0: csv_row.insert(i, [self.csv_header[i], 0]) else: csv_row.insert(i, [self.csv_header[i], 0]) except Exception as e: csv_row.insert(i, [self.csv_header[i], e.__str__() + ' image info failed!!']) exit_call = e.__str__() + ' image info failed!!' 
# --- images-per-page ratio (slot 17), OCR-risk heuristic 0-5 (slot 18),
# --- then documentInfo (author, ...) fields starting at slot 19 ---
print(exit_call)''' # TODO: IMAGES per page i = 17 percent_img_per_page = float try: if not image_count == 0 or page_count == 0: percent_img_per_page = (float(image_count) / float(page_count)) * 100 else: percent_img_per_page = 0 csv_row.insert(i, [self.csv_header[i], percent_img_per_page]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'IMG: ' + e.__str__()]) # TODO: OCR risk i = 18 try: if words_per_page == 0 or percent_img_per_page > 3000: ocr_risk = 5 elif words_per_page < 15 or percent_img_per_page > 2000: ocr_risk = 4 elif words_per_page < 40 or percent_img_per_page > 1000: ocr_risk = 3 elif words_per_page < 70 or percent_img_per_page > 425: ocr_risk = 2 elif words_per_page < 80 or percent_img_per_page > 200: ocr_risk = 1 else: ocr_risk = 0 csv_row.insert(i, [self.csv_header[i], ocr_risk]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'OCR: ' + e.__str__()]) # author, creator, producer, subject, title, di = pf try: di = pf.documentInfo except Exception as e: exit_call = e.__str__() + ' :: DOCUMENT INFO LOAD failed!!' print(exit_call) # Document info if di: # Author try: i = 19 if di.author: csv_row.insert( i, [self.csv_header[i], di.author.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'AUTHOR: ' + e.__str__()]) exit_call = e.__str__() + ' doc info failed!!' 
# --- remaining documentInfo fields (creator/producer/subject/title, slots
# --- 20-23), the sample clip (slot 24), final CSV write, cleanup and log ---
print(exit_call) # Creator try: i = 20 if di.creator: csv_row.insert( i, [self.csv_header[i], di.creator.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'CREATOR: ' + e.__str__()]) print(exit_call) print('#5.1') # Producer try: i = 21 if di.producer: csv_row.insert( i, [self.csv_header[i], di.producer.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert( i, [self.csv_header[i], 'PRODUCER: ' + e.__str__()]) print(exit_call) # Subject try: i = 22 if di.subject: csv_row.insert( i, [self.csv_header[i], di.subject.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'SUBJECT: ' + e.__str__()]) print(exit_call) # Title try: i = 23 if di.title: csv_row.insert( i, [self.csv_header[i], di.title.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'TITLE: ' + e.__str__()]) print(exit_call) # Document clip i = 24 try: csv_row.insert(i, [self.csv_header[i], write_clip]) except Exception as e: csv_row.insert(i, [self.csv_header[i], e.__str__()]) # Write results row = [] for i in range(csv_row.__len__()): row.append(csv_row[i][1]) report_path = self.report_folder + self.report_name # COPLETE WRITE with open(report_path, 'a', encoding='utf8', newline='') as csv_file: writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL) writer.dialect.lineterminator.replace('\n', '') writer.writerow(row) # csv_file.close() fp.close() os.remove(self.pdf_path) # Log close msg = (' >>>> PDF complete:[' + self.url + '] ' + self.line_count.__str__() + ' ' + (datetime.datetime.now().__str__()[:-7])) print(msg) utils.logline(self.log, msg)