def encodestdio(encodings=None, errors=None):
    """
    After this function is called, Unicode strings written to
    stdout/stderr are automatically encoded and strings read from stdin
    automatically decoded with the given encodings and error handling.

    encodings and errors can be a dict with mappings for
    stdin/stdout/stderr, e.g.
    encodings={'stdin': 'UTF-8', 'stdout': 'UTF-8', 'stderr': 'UTF-8'}
    or errors={'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}

    In the case of errors, stdin uses a default 'strict' error handling
    and stdout/stderr both use 'replace'.
    """
    if not encodings:
        encodings = {'stdin': None, 'stdout': None, 'stderr': None}
    if not errors:
        errors = {'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}
    for stream_name in set(encodings.keys() + errors.keys()):
        stream = getattr(sys, stream_name)
        encoding = encodings.get(stream_name)
        if not encoding:
            encoding = get_encoding(stream)
        error_handling = errors.get(stream_name, 'strict')
        if isinstance(stream, EncodedStream):
            stream.encoding = encoding
            stream.errors = error_handling
        else:
            setattr(sys, stream_name,
                    EncodedStream(stream, encoding, error_handling))

def test_1(self):
    # Fixture: a Chinese novel-site page that declares charset=gbk in a
    # meta tag; get_encoding() should detect it.
    soup = BeautifulSoup('''<html><head>
<title>第7424章_校花的贴身高手_八一中文网</title>
<meta http-equiv="content-type" content="text/html; charset=gbk" />
<meta http-equiv="Cache-Control" content="no-siteapp" />
<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="mobile-agent" content="format=html5; url=http://m.81xsw.com/book/169/10088024.html" />
<meta http-equiv="mobile-agent" content="format=xhtml; url=http://m.81xsw.com/book/169/10088024.html" />
<meta name="keywords" content="校花的贴身高手,第7424章" />
<meta name="description" content="八一中文网提供了鱼人二代创作的都市言情《校花的贴身高手》干净清爽无错字的文字章节:第7424章在线阅读。" />
<link rel="stylesheet" href="/css/xiaoshuo.css" />
<script type="text/javascript" src="http://libs.baidu.com/jquery/1.4.2/jquery.min.js"></script>
<script type="text/javascript" src="/js/bqg.js"></script>
<script type="text/javascript" src="/js/cookies.js"></script>
<script type="text/javascript">
var mobileAgent = new Array("iphone", "ipod", "ipad", "android", "mobile", "blackberry", "webos", "incognito", "webmate", "bada", "nokia", "lg", "ucweb", "skyfire");
var browser = navigator.userAgent.toLowerCase();
var isMobile = false;
for (var i = 0; i < mobileAgent.length; i++) {
    if (browser.indexOf(mobileAgent[i]) != -1) {
        isMobile = true;
        //alert(mobileAgent[i]);
        location.href = 'http://m.81xsw.com/book/169/10088024.html';
        break;
    }
}
</script>
</head>
</html>''')
    self.assertEqual('gbk', get_encoding(soup))

def encodestdio(encodings=None, errors=None):
    """
    After this function is called, Unicode strings written to
    stdout/stderr are automatically encoded and strings read from stdin
    automatically decoded with the given encodings and error handling.

    encodings and errors can be a dict with mappings for
    stdin/stdout/stderr, e.g.
    encodings={'stdin': 'UTF-8', 'stdout': 'UTF-8', 'stderr': 'UTF-8'}
    or errors={'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}

    In the case of errors, stdin uses a default 'strict' error handling
    and stdout/stderr both use 'replace'.
    """
    if not encodings:
        encodings = {'stdin': None, 'stdout': None, 'stderr': None}
    if not errors:
        errors = {'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}
    for stream_name in set(list(encodings.keys()) + list(errors.keys())):
        stream = getattr(sys, stream_name)
        encoding = encodings.get(stream_name)
        if not encoding:
            encoding = get_encoding(stream)
        error_handling = errors.get(stream_name, 'strict')
        if isinstance(stream, EncodedStream):
            stream.encoding = encoding
            stream.errors = error_handling
        else:
            setattr(sys, stream_name,
                    EncodedStream(stream, encoding, error_handling))

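# Usage sketch (hypothetical, assuming encodestdio(), EncodedStream and
# get_encoding() above are defined in the current module): after the
# call, Unicode written to stdout/stderr is transparently encoded, with
# unencodable characters replaced rather than raising UnicodeEncodeError.
import sys

encodestdio(encodings={'stdout': 'UTF-8', 'stderr': 'UTF-8'},
            errors={'stdout': 'replace', 'stderr': 'replace'})
sys.stdout.write(u'written through an EncodedStream\n')
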
def __init__(self, pad=False, padchar=" ", sep=" ", end="\n",
             file_=sys.stdout, fn=None, encoding=None):
    """
    Write safely, avoiding any UnicodeDe-/EncodingErrors on strings
    and converting all other objects to safe string representations.

    sprint = SafePrinter(pad=False, padchar=' ', sep=' ', end='\\n',
                         file=sys.stdout, fn=None)
    sprint(value, ..., pad=False, padchar=' ', sep=' ', end='\\n',
           file=sys.stdout, fn=None)

    Writes the values to a stream (default sys.stdout), honoring its
    encoding and silently replacing characters not present in the
    encoding with question marks.

    Optional keyword arguments:
    pad:     pad the lines to n chars, or os.getenv('COLUMNS') if True.
    padchar: character to use for padding, default a space.
    sep:     string inserted between values, default a space.
    end:     string appended after the last value, default a newline.
    file:    a file-like object (stream); defaults to sys.stdout.
    fn:      a function to execute instead of printing.
    """
    self.pad = pad
    self.padchar = padchar
    self.sep = sep
    self.end = end
    self.file = file_
    self.fn = fn
    self.encoding = encoding or (get_encoding(file_) if file_ else None)

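# Instantiation sketch (hypothetical): only the constructor is shown
# above, so this illustrates the intended arguments rather than a full
# printing call; SafePrinter.__call__ is assumed to exist as documented
# in the docstring.
import sys

sprint = SafePrinter(pad=True, padchar='.', sep=' ', end='\n',
                     file_=sys.stdout)
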
def __init__(self, input, **options):
    """Generate the document

    :param input: string of the html content.

    kwargs:
        - attributes:
        - debug: output debug messages
        - min_text_length:
        - retry_length:
        - url: will allow adjusting links to be absolute
    """
    if isinstance(input, unicode):
        self.input = input
    else:
        enc = get_encoding(input)
        self.input = input.decode(enc, u'ignore')
    self.input = merge_space(self.input)
    self.options = options
    self.html = None
    #self.post_title = None
    #self.pub_date = None
    #self.author = None
    self.trans_html = None
    self.trans_flag = False

def build_doc(page):
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, 'replace')
    doc = lxml.html.document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc

def build_doc(page):
    if isinstance(page, unicode):
        enc = None
        page_unicode = page
    else:
        enc = get_encoding(page) or 'utf-8'
        page_unicode = page.decode(enc, 'replace')
    doc = lxml.html.document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc, enc

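# Usage sketch for the variant above that also returns the detected
# encoding (raw_html is a hypothetical byte string fetched elsewhere):
doc, enc = build_doc(raw_html)
print('detected source encoding:', enc)
print(doc.findtext('.//title'))
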
def get_next_page_url(url, soup):
    next_page = ''
    logging.debug("Trying to get next page link from soup")
    #r = requests.get(url)
    #soup = BeautifulSoup(r.text, 'lxml')
    encoding = get_encoding(soup)
    urls = soup.find_all('a', href=True)
    for u in urls:
        # Match links whose text is '下一章' ("next chapter") or '下一页'
        # ("next page"); strip whitespace from the link text itself, not
        # from the literal it is compared against.
        if u.text.strip() == '下一章' or u.text.strip() == '下一页':
            next_page = urljoin(url, u['href'])
            logging.debug("Getting next page url = " + next_page)
            break
    return next_page

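# Hypothetical crawl loop built on get_next_page_url(): keep following
# the "next chapter"/"next page" links until none is found.  Assumes
# requests and BeautifulSoup are imported and start_url is a chapter URL.
url = start_url
while url:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    url = get_next_page_url(url, soup)  # returns '' (falsy) when done
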
def build_doc(page):
    """Build an lxml document from `page`; returns '' if `page` is None."""
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, 'replace')
    doc = document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc

def build_doc(page):
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        # remove any BOM from UTF-8 data
        if page[:3] == codecs.BOM_UTF8:
            page = page[3:]
        if enc is not None:
            page_unicode = page.decode(enc, 'replace')
        else:
            # what to do? Should we fall back to chardet?
            page_unicode = page
    doc = lxml.html.document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc

def build_doc(page):
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        # remove any BOM from UTF-8 data
        if page[:3] == codecs.BOM_UTF8:
            page = page[3:]
        if enc is not None:
            page_unicode = page.decode(enc, 'replace')
        else:
            # what to do? Should we fall back to chardet?
            page_unicode = page
    doc = lxml.html.document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    remove_elements(doc)
    return doc

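# Side note on the BOM handling above (standard library fact, shown for
# reference): codecs.BOM_UTF8 is the three-byte UTF-8 byte-order mark
# that the two build_doc() variants strip before decoding.
import codecs

assert codecs.BOM_UTF8 == b'\xef\xbb\xbf'
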
def perform_inference(plate_image, model):
    '''
    Performs inference on an input image, given a model.
    '''
    # Create a Network for using the Inference Engine
    inference_network = Network()
    # Load the model in the network, and obtain its input shape
    h = 24
    w = 94
    input_shapes = inference_network.load_model(model)
    print('input_shapes:', input_shapes)
    # Read the input image
    image = plate_image.copy()
    # Preprocess the input image
    preprocessed_image = preprocessing(image, h, w)
    print('done with preprocessing...')
    # Create the second input for the lp_recognition model
    # of the form [0, 1, ..., 1] of shape (88, 1)
    seq_ind = np.ones(88)
    seq_ind[0] = 0
    seq_ind = seq_ind.reshape(88, 1)
    # Perform synchronous inference on the image
    inference_network.sync_inference(preprocessed_image, seq_ind)
    # Obtain the output of the inference request
    gen_output = inference_network.extract_output()
    output = gen_output['decode']
    # Obtain the decoded output
    lp_codes = get_encoding()
    numbers_list = decode_output(output, lp_codes)
    print('Recognized License plate number is: ')
    fin_out = ''
    for item in numbers_list:
        fin_out += str(item)
    return fin_out

def test_2(self):
    # Fixture: a page with mixed-case <Head>/<Link> tags that declares
    # charset=gbk twice; get_encoding() should still detect it.
    soup = BeautifulSoup('''<html><Head>
<meta http-equiv="Cache-Control" content="no-siteapp" />
<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="mobile-agent" content="format=html5; url=http://m.liushuba.com/files/article/html/60/60186/23554540.html" />
<meta http-equiv="mobile-agent" content="format=xhtml; url=http://m.liushuba.com/files/article/html/60/60186/23554540.html" />
<meta http-equiv="Content-Type" content="text/html; charset=gbk" />
<title>修真聊天群最新章节- 第2431章 您好,您的半身已关机-55小说网</title>
<meta http-equiv="Content-Type" content="text/html; charset=gbk" />
<meta name="keywords" content="修真聊天群最新章节,小说修真聊天群TXT下载,修真聊天群全文阅读,圣骑士的传说作品" />
<meta name="description" content="《修真聊天群》的最新章节《 第2431章 您好,您的半身已关机》无弹窗广告" />
<meta name="author" content="圣骑士的传说" />
<Link rel="stylesheet" href="/heibing/css/style.css" type="text/css"/>
<script src="/js/tz.js" type="text/javascript"></script>
<script type="text/javascript">uaredirect("http://m.liushuba.com/files/article/html/60/60186/23554540.html");</script>
<script language="javascript" type="text/javascript" src="/heibing/js/xiaoshuo.js"></script>
<script type="text/javascript">var preview_page = "23552272.html",next_page = "23554733.html",index_page = "index.html",article_id = "60186",chapter_id = "23554540";function jumpPage(event){var evt =event?event:window.event;if(evt.keyCode==37) location=preview_page;if (evt.keyCode==39) location=next_page;if (evt.keyCode==13) location=index_page;}document.onkeydown=jumpPage;</script>
</Head>
</html>''')
    self.assertEqual('gbk', get_encoding(soup))

def build_doc(page):
    enc = get_encoding(page)
    page_enc = page.decode(enc, 'replace').encode('utf-8')
    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
    return doc

            q = Popen(args, stdin=PIPE, stdout=stdout, stderr=stderr,
                      shell=shell, cwd=cwd, **kwargs)
        else:
            assert _MSWINDOWS == True, 'invalid platform'
            if shell is None:
                shell = True
            # win32 doesn't have os.execvp(), so the command has to run
            # in a shell
            q = Popen(args, stdin=PIPE, stdout=stdout, stderr=stderr,
                      shell=shell, cwd=cwd, **kwargs)
    except OSError, e:
        raise CommandError(list_args, status=e.args[0], stderr=e)
    stdout, stderr = q.communicate(input=stdin)
    status = q.wait()
    if unicode_output == True:
        if encoding == None:
            encoding = get_encoding()
        if stdout != None:
            stdout = unicode(stdout, encoding)
        if stderr != None:
            stderr = unicode(stderr, encoding)
    if verbose == True:
        print >> sys.stderr, '%d\n%s%s' % (status, stdout, stderr)
    if status not in expect:
        raise CommandError(list_args, status, stdout, stderr)
    return status, stdout, stderr


class Pipe (object):
    """Simple interface for executing POSIX-style pipes.

    Based on the `subprocess` module.  The only complication is the
    adaptation of `subprocess.Popen._communicate` to listen to the
def __init__(self, settings_path="settings.json",
             run_settings="run_settings/default.json",
             training_set_mode="train", train_or_predict="train",
             verbose=False, force=False, split=1):
    # parse the settings file
    self.settings = neukrill_net.utils.Settings(settings_path)
    # get the run settings
    if train_or_predict == 'test':
        force = True
    self.run_settings = neukrill_net.utils.load_run_settings(
        run_settings, self.settings, force=force)
    processing_settings = self.run_settings["preprocessing"]
    # get a processing function from this
    processing = neukrill_net.augment.augmentation_wrapper(
        **processing_settings)
    # super simple if statements for predict/train
    if train_or_predict == "train":
        # split the dataset based on training_set_mode option:
        self.settings.image_fnames[train_or_predict] = \
            neukrill_net.utils.train_test_split(
                self.settings.image_fnames, training_set_mode,
                train_split=self.run_settings["train_split"])
        # count the images
        self.N_images = sum(
            1 for class_label in self.settings.classes
            for image_path in
            self.settings.image_fnames[train_or_predict][class_label])
        # multiply that count by augmentation factor
        self.N_images = int(self.N_images *
                            self.run_settings["augmentation_factor"])
        # initialise y vector
        y = []
        # initialise array
        X = np.zeros((self.N_images, self.run_settings["final_shape"][0],
                      self.run_settings["final_shape"][1], 1))
        image_index = 0
        # load the images in image_fpaths, iterating and keeping track
        # of class
        if self.run_settings.get("use_super_classes", False):
            # create dictionary to cache superclass vectors
            supclass_vecs = {}
            # get the general hierarchy
            general_hier = enc.get_hierarchy(self.settings)
        for class_label in self.settings.classes:
            for image_path in self.settings.image_fnames[
                    train_or_predict][class_label]:
                # load the image as numpy array
                image = skimage.io.imread(image_path)
                # apply processing function (get back multiple images)
                images = processing(image)
                # for each image store a class label
                if self.run_settings.get("use_super_classes", False):
                    # check if superclass vector for this class label
                    # already generated, if not generate
                    if class_label not in supclass_vecs:
                        # get superclass hierarchy for class label
                        supclass_hier = enc.get_encoding(class_label,
                                                         general_hier)
                        # collapse to a list of 1/0 values
                        supclass_vecs[class_label] = \
                            [el for grp in supclass_hier for el in grp]
                    y += [supclass_vecs[class_label]] * len(images)
                else:
                    y += [class_label] * len(images)
                # then broadcast each of these images into the empty X array
                for image in images:
                    X[image_index, :, :, 0] = image
                    image_index += 1
        # if we're normalising
        if processing_settings.get("normalise", False):
            if verbose:
                print("Applying normalisation: {0}".format(
                    processing_settings["normalise"]["global_or_pixel"]))
            # then call the normalise function
            X, self.run_settings = neukrill_net.image_processing.normalise(
                X, self.run_settings, verbose=verbose)
        # make sure y is an array
        y = np.array(y)
        if self.run_settings.get("use_super_classes", False):
            # using superclasses so y already contains target vectors
            super(self.__class__, self).__init__(topo_view=X, y=y)
        else:
            # not using superclasses so map label strings to integers
            # count the y labels
            N_y_labels = len(list(set(y)))
            # build dictionary to encode labels numerically
            class_dictionary = {}
            for i, c in enumerate(self.settings.classes):
                class_dictionary[c] = i
            # map to integers
            y = np.array(map(lambda c: class_dictionary[c], y))
            # make it 2D column vector
            y = y[np.newaxis].T
            # now run inherited initialisation
            super(self.__class__, self).__init__(topo_view=X, y=y,
                                                 y_labels=N_y_labels)
    elif train_or_predict == "test":
        # split test paths if we're splitting them
        self.settings.image_fnames = neukrill_net.utils.test_split(
            split, self.settings.image_fnames)
        # test is just a big list of image paths
        # how many?
        self.N_images = sum(
            1 for image_path in
            self.settings.image_fnames[train_or_predict])
        # check augmentation in the traditional way (it's boilerplate time)
        self.N_images = int(self.N_images *
                            self.run_settings["augmentation_factor"])
        # more boilerplate code, but it's going to be easier than making
        # a function that can deal with the above as well
        # initialise array
        X = np.zeros((self.N_images, self.run_settings["final_shape"][0],
                      self.run_settings["final_shape"][1], 1))
        image_index = 0
        if verbose:
            print("Loading this many images:...........................")
        # get a list of 50 image_paths to watch out for
        stepsize = int(len(self.settings.image_fnames[train_or_predict]) / 50)
        progress_paths = [
            impath for i, impath in
            enumerate(self.settings.image_fnames[train_or_predict])
            if i % stepsize == 0]
        # loop over all the images, in order
        for image_path in self.settings.image_fnames[train_or_predict]:
            if verbose:
                if image_path in progress_paths:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                    # if it's the last one we better stick a newline on
                    if image_path == progress_paths[-1]:
                        sys.stdout.write(".\n")
            # load the image as numpy array
            image = skimage.io.imread(image_path)
            # apply processing function (get back multiple images)
            images = processing(image)
            # then broadcast each of these images into the empty X array
            for image in images:
                X[image_index, :, :, 0] = image
                image_index += 1
        # if we're normalising
        if processing_settings.get("normalise", 0):
            if verbose:
                print("Applying normalisation: {0}".format(
                    processing_settings["normalise"]["global_or_pixel"]))
            # then call the normalise function
            X, self.run_settings = neukrill_net.image_processing.normalise(
                X, self.run_settings, verbose=verbose)
        # store the names in this dataset object
        self.names = [os.path.basename(fpath) for fpath in
                      self.settings.image_fnames[train_or_predict]]
        # now run inherited initialisation
        super(self.__class__, self).__init__(topo_view=X)
    else:
        raise ValueError('Invalid option: should be either "train" for '
                         'training or "test" for prediction (I know '
                         'that is annoying).')

                shell = True
            # win32 doesn't have os.execvp(), so the command has to run
            # in a shell
            q = Popen(args, stdin=PIPE, stdout=stdout, stderr=stderr,
                      shell=shell, cwd=cwd, **kwargs)
    except OSError, e:
        raise CommandError(list_args, status=e.args[0], stderr=e)
    stdout, stderr = q.communicate(input=stdin)
    status = q.wait()
    if unicode_output == True:
        if encoding == None:
            encoding = get_encoding()
        if stdout != None:
            stdout = unicode(stdout, encoding)
        if stderr != None:
            stderr = unicode(stderr, encoding)
        libbe.LOG.debug(u'{0}\n{1}{2}'.format(status, stdout, stderr))
    else:
        libbe.LOG.debug('{0}\n{1}{2}'.format(status, stdout, stderr))
    if status not in expect:
        raise CommandError(list_args, status, stdout, stderr)
    return status, stdout, stderr


if libbe.TESTING == True:
    suite = doctest.DocTestSuite()