Esempio n. 1
0
def encodestdio(encodings=None, errors=None):
    """ After this function is called, Unicode strings written to
    stdout/stderr are automatically encoded and strings read from stdin
    automatically decoded with the given encodings and error handling.

    encodings and errors can be a dict with mappings for stdin/stdout/stderr,
    e.g. encodings={'stdin': 'UTF-8', 'stdout': 'UTF-8', 'stderr': 'UTF-8'}
    or errors={'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}

    In the case of errors, stdin uses a default 'strict' error handling and
    stdout/stderr both use 'replace'.
    """
    if not encodings:
        encodings = {'stdin': None, 'stdout': None, 'stderr': None}
    if not errors:
        errors = {'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}
    # Fix: `encodings.keys() + errors.keys()` raises TypeError on Python 3,
    # where keys() returns a view that does not support concatenation.
    # A set union of the dicts (iterating a dict yields its keys) works on
    # both Python 2 and 3.
    for stream_name in set(encodings) | set(errors):
        stream = getattr(sys, stream_name)
        encoding = encodings.get(stream_name)
        if not encoding:
            # No explicit encoding requested: sniff it from the stream.
            encoding = get_encoding(stream)
        error_handling = errors.get(stream_name, 'strict')
        if isinstance(stream, EncodedStream):
            # Stream is already wrapped: just update its settings in place.
            stream.encoding = encoding
            stream.errors = error_handling
        else:
            # Wrap the raw stream so reads/writes are transparently recoded.
            setattr(sys, stream_name, EncodedStream(stream, encoding,
                                                    error_handling))
Esempio n. 2
0
    def test_1(self):
        """get_encoding() should return the charset declared in the page's
        http-equiv content-type meta tag -- here 'gbk'."""
        soup = BeautifulSoup('''<html><head>
<title>第7424章_校花的贴身高手_八一中文网</title>
<meta http-equiv="content-type" content="text/html; charset=gbk" />
<meta http-equiv="Cache-Control" content="no-siteapp" />
<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="mobile-agent" content="format=html5; url=http://m.81xsw.com/book/169/10088024.html" />
<meta http-equiv="mobile-agent" content="format=xhtml; url=http://m.81xsw.com/book/169/10088024.html" />
<meta name="keywords" content="校花的贴身高手,第7424章" />
<meta name="description" content="八一中文网提供了鱼人二代创作的都市言情《校花的贴身高手》干净清爽无错字的文字章节:第7424章在线阅读。" />
<link rel="stylesheet" href="/css/xiaoshuo.css" />
<script type="text/javascript" src="http://libs.baidu.com/jquery/1.4.2/jquery.min.js"></script>
<script type="text/javascript" src="/js/bqg.js"></script>
<script type="text/javascript" src="/js/cookies.js"></script>
<script type="text/javascript"> 
var mobileAgent = new Array("iphone", "ipod", "ipad", "android", "mobile", "blackberry", "webos", "incognito", "webmate", "bada", "nokia", "lg", "ucweb", "skyfire"); 
var browser = navigator.userAgent.toLowerCase(); 
var isMobile = false; 
for (var i=0; i<mobileAgent.length; i++){ if (browser.indexOf(mobileAgent[i])!=-1){ isMobile = true; 
//alert(mobileAgent[i]); 
location.href = 'http://m.81xsw.com/book/169/10088024.html'; 
break; } } 
</script> 
</head>
</html>''')
        self.assertEqual('gbk', get_encoding(soup))
Esempio n. 3
0
def encodestdio(encodings=None, errors=None):
    """Wrap sys.stdin/stdout/stderr so that Unicode text written to
    stdout/stderr is transparently encoded and text read from stdin is
    transparently decoded.

    encodings and errors may be dicts keyed by stream name
    ('stdin'/'stdout'/'stderr'), e.g.
    encodings={'stdin': 'UTF-8', 'stdout': 'UTF-8', 'stderr': 'UTF-8'}
    or errors={'stdin': 'strict', 'stdout': 'replace', 'stderr': 'replace'}.

    By default stdin uses 'strict' error handling while stdout and stderr
    both use 'replace'.
    """
    encodings = encodings or {'stdin': None, 'stdout': None, 'stderr': None}
    errors = errors or {'stdin': 'strict', 'stdout': 'replace',
                        'stderr': 'replace'}
    # Visit every stream mentioned in either mapping, exactly once.
    stream_names = set(list(encodings.keys()) + list(errors.keys()))
    for name in stream_names:
        stream = getattr(sys, name)
        # Use the requested encoding, or sniff one from the stream itself.
        enc = encodings.get(name) or get_encoding(stream)
        err = errors.get(name, 'strict')
        if isinstance(stream, EncodedStream):
            # Already wrapped: update the existing wrapper in place.
            stream.encoding = enc
            stream.errors = err
        else:
            # Replace the raw stream with a recoding wrapper.
            setattr(sys, name, EncodedStream(stream, enc, err))
Esempio n. 4
0
	def __init__(self, pad=False, padchar=" ", sep=" ", end="\n", 
				 file_=sys.stdout, fn=None, encoding=None):
		"""
		Write safely, avoiding any UnicodeDe-/EncodingErrors on strings 
		and converting all other objects to safe string representations.
		
		sprint = SafePrinter(pad=False, padchar=' ', sep=' ', end='\\n', 
							 file=sys.stdout, fn=None)
		sprint(value, ..., pad=False, padchar=' ', sep=' ', end='\\n', 
			   file=sys.stdout, fn=None)
		
		Writes the values to a stream (default sys.stdout), honoring its 
		encoding and replacing characters not present in the encoding with 
		question marks silently.
		
		Optional keyword arguments:
		pad:      pad the lines to n chars, or os.getenv('COLUMNS') if True.
		padchar:  character to use for padding, default a space.
		sep:      string inserted between values, default a space.
		end:      string appended after the last value, default a newline.
		file:     a file-like object (stream); defaults to the sys.stdout.
		fn:       a function to execute instead of printing.
		encoding: encoding to use for output; when omitted it is detected
		          from file_ via get_encoding().
		"""
		self.pad = pad
		self.padchar = padchar
		self.sep = sep
		self.end = end
		self.file = file_
		self.fn = fn
		# Prefer the explicit encoding; otherwise sniff it from the stream
		# (None when no stream was given either).
		self.encoding = encoding or (get_encoding(file_) if file_ else None)
Esempio n. 5
0
    def __init__(self, input, **options):
        """Generate the document

        :param input: string of the html content.  A `unicode` string is
            used as-is; a byte string is decoded with the encoding sniffed
            from its content (undecodable bytes are dropped).

        kwargs:
            - attributes:
            - debug: output debug messages
            - min_text_length:
            - retry_length:
            - url: will allow adjusting links to be absolute

        """
        # Python 2 code: `unicode` is the builtin text type here.
        if isinstance(input, unicode):
            self.input = input
        else:
            enc = get_encoding(input)
            # 'ignore' silently drops bytes that don't decode.
            self.input = input.decode(enc, u'ignore')
        # Presumably collapses runs of whitespace -- confirm against
        # merge_space's definition.
        self.input = merge_space(self.input)
        self.options = options
        # Parsed HTML tree; populated later (not in this constructor).
        self.html = None
        #self.post_title = None
        #self.pub_date = None
        #self.author = None
        # Translated/transformed variants; initialised empty here.
        self.trans_html = None 
        self.trans_flag = False
Esempio n. 6
0
def build_doc(page):
    """Parse *page* (unicode text or an encoded byte string) into an
    lxml HTML document.

    Byte strings are decoded with the encoding sniffed by get_encoding()
    (falling back to UTF-8 when sniffing fails), replacing undecodable
    bytes, then re-encoded as UTF-8 for the shared utf8_parser.
    """
    if isinstance(page, unicode):
        page_unicode = page
    else:
        # Fix: get_encoding() can return None when detection fails, and
        # page.decode(None, ...) raises TypeError.  Fall back to UTF-8
        # (the sibling variant of this function does the same).
        enc = get_encoding(page) or 'utf-8'
        page_unicode = page.decode(enc, 'replace')
    doc = lxml.html.document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc
Esempio n. 7
0
def build_doc(page):
    """Parse *page* into an lxml HTML document.

    Returns a (doc, encoding) tuple.  encoding is None when *page* was
    already unicode; otherwise it is the encoding used to decode it
    (sniffed via get_encoding(), defaulting to 'utf-8').
    """
    if isinstance(page, unicode):
        enc, page_unicode = None, page
    else:
        # Decode byte input, replacing undecodable bytes.
        enc = get_encoding(page) or 'utf-8'
        page_unicode = page.decode(enc, 'replace')
    # The shared parser expects UTF-8 bytes.
    utf8_bytes = page_unicode.encode('utf-8', 'replace')
    doc = lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser)
    return doc, enc
Esempio n. 8
0
def get_next_page_url(url, soup):
    """Return the absolute URL of the "next page" link in *soup*.

    Scans every anchor with an href for link text equal to '下一章'
    ("next chapter") or '下一页' ("next page"), joins its href against
    *url*, and returns it.  Returns '' when no such link exists.
    """
    next_page = ''
    logging.debug("Trying to get next page link from soup")
    for anchor in soup.find_all('a', href=True):
        # Bug fix: the original compared `u.text == '下一页'.strip()`,
        # stripping the literal instead of the link text, so '下一页'
        # links with surrounding whitespace were never matched.
        text = anchor.text.strip()
        if text == '下一章' or text == '下一页':
            next_page = urljoin(url, anchor['href'])
            logging.debug("Getting next page url = " + next_page)
            break
    return next_page
Esempio n. 9
0
def build_doc(page):
    """Parse *page* into an lxml document via the shared utf8_parser.

    *page* may be unicode text or an encoded byte string; byte input is
    decoded with the sniffed encoding (UTF-8 fallback), replacing
    undecodable bytes.  A None *page* is tolerated: it is logged and an
    empty string is returned instead of a document.
    """
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''
    if isinstance(page, unicode):
        page_unicode = page
    else:
        # Fix: get_encoding() may return None when sniffing fails, and
        # page.decode(None, ...) raises TypeError -- fall back to UTF-8.
        enc = get_encoding(page) or 'utf-8'
        page_unicode = page.decode(enc, 'replace')
    doc = document_fromstring(
        page_unicode.encode('utf-8', 'replace'),
        parser=utf8_parser)
    return doc
Esempio n. 10
0
def build_doc(page):
    """Parse *page* (unicode or byte string) into an lxml HTML document.

    Byte input is decoded with the encoding sniffed by get_encoding();
    a leading UTF-8 BOM is stripped first so it does not leak into the
    document text.
    """
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        # remove any BOM from UTF-8 data
        if page[:3] == codecs.BOM_UTF8:
            page = page[3:]
        if enc != None :
            page_unicode = page.decode(enc, 'replace')
        else: # what to do? Should we fall back to chardet ?
            # NOTE(review): when enc is None, page stays a byte string
            # here; the .encode() below then relies on Python 2's implicit
            # ASCII decode and will raise on non-ASCII input -- confirm
            # this fallback is intended.
            page_unicode = page
    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc
Esempio n. 11
0
def build_doc(page):
    """Parse *page* (unicode or byte string) into an lxml HTML document
    and strip unwanted elements from it before returning.

    Byte input is decoded with the encoding sniffed by get_encoding();
    a leading UTF-8 BOM is stripped first so it does not leak into the
    document text.
    """
    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        # remove any BOM from UTF-8 data
        if page[:3] == codecs.BOM_UTF8:
            page = page[3:]
        if enc != None :
            page_unicode = page.decode(enc, 'replace')
        else: # what to do? Should we fall back to chardet ?
            # NOTE(review): when enc is None, page stays a byte string
            # here; the .encode() below then relies on Python 2's implicit
            # ASCII decode and will raise on non-ASCII input -- confirm
            # this fallback is intended.
            page_unicode = page
    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    # Presumably prunes script/ad/boilerplate nodes in place -- see
    # remove_elements' definition.
    remove_elements(doc)
    return doc
Esempio n. 12
0
def perform_inference(plate_image, model):
	'''
	Run license-plate recognition inference on *plate_image* with *model*.

	Loads the model into an Inference Engine Network, preprocesses the
	image to the model's 24x94 input, runs a synchronous inference with
	the auxiliary sequence-indicator input, decodes the 'decode' output
	blob, and returns the recognized plate number as a string.
	'''
	# Create a Network for using the Inference Engine
	inference_network = Network()
	# Expected input height/width of the LPR model
	h = 24
	w = 94
	# Load the model in the network, and obtain its input shape.
	# Renamed from `eval`, which shadowed the builtin of the same name.
	input_shapes = inference_network.load_model(model)
	print('input_shapes:', input_shapes)

	# Work on a copy so the caller's image is not modified
	image = plate_image.copy()

	# Preprocess the input image to the network's input layout
	preprocessed_image = preprocessing(image, h, w)
	print('done with preprocessing...')

	# Create the second input for the lp_recognition model
	# of the form [0, 1, ..., 1] of shape (88, 1)
	seq_ind = np.ones(88)
	seq_ind[0] = 0
	seq_ind = seq_ind.reshape(88, 1)

	# Perform synchronous inference on the image
	inference_network.sync_inference(preprocessed_image, seq_ind)

	# Obtain the output of the inference request
	gen_output = inference_network.extract_output()
	output = gen_output['decode']

	# Obtain the decoded output
	lp_codes = get_encoding()
	numbers_list = decode_output(output, lp_codes)
	print('Recognized License plate number is: ')

	# Join the decoded items into the final plate string
	# (replaces a quadratic `fin_out += str(item)` loop).
	return ''.join(str(item) for item in numbers_list)
Esempio n. 13
0
    def test_2(self):
        """get_encoding() should find the charset even with oddly-cased
        tags (<Head>, <Link>) and duplicated Content-Type metas -- here
        'gbk'."""
        soup = BeautifulSoup('''<html><Head>
<meta http-equiv="Cache-Control" content="no-siteapp" />
<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="mobile-agent" content="format=html5; url=http://m.liushuba.com/files/article/html/60/60186/23554540.html" />
<meta http-equiv="mobile-agent" content="format=xhtml; url=http://m.liushuba.com/files/article/html/60/60186/23554540.html" />
<meta http-equiv="Content-Type" content="text/html; charset=gbk" />
<title>修真聊天群最新章节- 第2431章 您好,您的半身已关机-55小说网</title> 
<meta http-equiv="Content-Type" content="text/html; charset=gbk" />
<meta name="keywords" content="修真聊天群最新章节,小说修真聊天群TXT下载,修真聊天群全文阅读,圣骑士的传说作品" /> 
<meta name="description" content="《修真聊天群》的最新章节《 第2431章 您好,您的半身已关机》无弹窗广告" /> 
<meta name="author" content="圣骑士的传说" />
<Link rel="stylesheet" href="/heibing/css/style.css" type="text/css"/>
<script src="/js/tz.js" type="text/javascript"></script>
<script type="text/javascript">uaredirect("http://m.liushuba.com/files/article/html/60/60186/23554540.html");</script>
<script language="javascript" type="text/javascript" src="/heibing/js/xiaoshuo.js"></script>
<script type="text/javascript">var preview_page = "23552272.html",next_page = "23554733.html",index_page = "index.html",article_id = "60186",chapter_id = "23554540";function jumpPage(event){var evt =event?event:window.event;if(evt.keyCode==37) location=preview_page;if (evt.keyCode==39) location=next_page;if (evt.keyCode==13) location=index_page;}document.onkeydown=jumpPage;</script>
</Head>
</html>''')
        self.assertEqual('gbk', get_encoding(soup))
Esempio n. 14
0
    def __init__(self,
                 pad=False,
                 padchar=" ",
                 sep=" ",
                 end="\n",
                 file_=sys.stdout,
                 fn=None,
                 encoding=None):
        """
        Write safely, avoiding any UnicodeDe-/EncodingErrors on strings
        and converting all other objects to safe string representations.

        sprint = SafePrinter(pad=False, padchar=' ', sep=' ', end='\\n',
                             file=sys.stdout, fn=None)
        sprint(value, ..., pad=False, padchar=' ', sep=' ', end='\\n',
               file=sys.stdout, fn=None)

        Writes the values to a stream (default sys.stdout), honoring its
        encoding and replacing characters not present in the encoding with
        question marks silently.

        Optional keyword arguments:
        pad:      pad the lines to n chars, or os.getenv('COLUMNS') if True.
        padchar:  character to use for padding, default a space.
        sep:      string inserted between values, default a space.
        end:      string appended after the last value, default a newline.
        file:     a file-like object (stream); defaults to the sys.stdout.
        fn:       a function to execute instead of printing.
        encoding: encoding to use for output; when omitted it is detected
                  from file_ via get_encoding().
        """
        self.pad = pad
        self.padchar = padchar
        self.sep = sep
        self.end = end
        self.file = file_
        self.fn = fn
        # Prefer the explicit encoding; otherwise sniff it from the stream
        # (None when no stream was given either).
        self.encoding = encoding or (get_encoding(file_) if file_ else None)
Esempio n. 15
0
def build_doc(page):
    """Decode the byte string *page* and parse it into an lxml HTML
    document via the shared utf8_parser.

    The input is decoded with the encoding sniffed by get_encoding()
    (falling back to UTF-8 when sniffing fails), replacing undecodable
    bytes, then re-encoded as UTF-8 for the parser.
    """
    # Fix: get_encoding() can return None when detection fails, and
    # page.decode(None, ...) raises TypeError -- fall back to UTF-8.
    enc = get_encoding(page) or 'utf-8'
    page_enc = page.decode(enc, 'replace').encode('utf-8')
    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
    return doc
Esempio n. 16
0
            q = Popen(args, stdin=PIPE, stdout=stdout, stderr=stderr,
                      shell=shell, cwd=cwd, **kwargs)
        else:
            assert _MSWINDOWS==True, 'invalid platform'
            if shell is None:
                shell = True
            # win32 don't have os.execvp() so have to run command in a shell
            q = Popen(args, stdin=PIPE, stdout=stdout, stderr=stderr,
                      shell=shell, cwd=cwd, **kwargs)
    except OSError, e:
        raise CommandError(list_args, status=e.args[0], stderr=e)
    stdout,stderr = q.communicate(input=stdin)
    status = q.wait()
    if unicode_output == True:
        if encoding == None:
            encoding = get_encoding()
        if stdout != None:
            stdout = unicode(stdout, encoding)
        if stderr != None:
            stderr = unicode(stderr, encoding)
    if verbose == True:
        print >> sys.stderr, '%d\n%s%s' % (status, stdout, stderr)
    if status not in expect:
        raise CommandError(list_args, status, stdout, stderr)
    return status, stdout, stderr

class Pipe (object):
    """Simple interface for executing POSIX-style pipes.

    Based on the `subprocess` module.  The only complication is the
    adaptation of `subprocess.Popen._communicate` to listen to the
    def __init__(self,settings_path="settings.json",
            run_settings="run_settings/default.json",training_set_mode="train",
            train_or_predict="train", verbose=False, force=False, split=1):
        """Build the image dataset array and hand it to the parent class.

        Loads settings and run settings, builds a preprocessing/augmentation
        function from them, reads every image with skimage, stacks the
        processed images into a 4-D array X and (for training) a label
        vector y, then calls the inherited __init__ with topo_view=X.

        settings_path:     path to the global settings JSON file.
        run_settings:      path to the run-specific settings JSON file.
        training_set_mode: forwarded to train_test_split to pick the split.
        train_or_predict:  'train' builds a labelled training array,
                           'test' builds an unlabelled prediction array;
                           anything else raises ValueError.
        verbose:           print progress information.
        force:             forwarded to load_run_settings (forced True for
                           'test').
        split:             which test split to load (test mode only).
        """
        # parse the settings file
        self.settings = neukrill_net.utils.Settings(settings_path)
        # get the run settings
        if train_or_predict == 'test':
            force=True
        self.run_settings = neukrill_net.utils.load_run_settings(run_settings,
                                                                self.settings,
                                                                force=force)
        processing_settings = self.run_settings["preprocessing"]
        # get a processing function from this
        processing = neukrill_net.augment.augmentation_wrapper(
                                                        **processing_settings)

        # super simple if statements for predict/train
        if train_or_predict == "train":
            # split the dataset based on training_set_mode option:
            self.settings.image_fnames[train_or_predict] = \
                    neukrill_net.utils.train_test_split(
                            self.settings.image_fnames, 
                            training_set_mode, 
                            train_split=self.run_settings["train_split"])

            # count the images
            self.N_images = sum(1 for class_label in self.settings.classes
                    for image_path in 
                    self.settings.image_fnames[train_or_predict][class_label])
            # multiply that count by augmentation factor
            self.N_images = int(self.N_images*
                    self.run_settings["augmentation_factor"])
            # initialise y vector
            y = []
            # initialise array
            X = np.zeros((self.N_images,self.run_settings["final_shape"][0],
                self.run_settings["final_shape"][1],1))
            image_index = 0
            # load the images in image_fpaths, iterating and keeping track of class
            if self.run_settings.get("use_super_classes", False):
                # create dictionary to cache superclass vectors
                supclass_vecs = {}
                # get the general hierarchy
                general_hier = enc.get_hierarchy(self.settings)
            for class_label in self.settings.classes:
                for image_path in self.settings.image_fnames[
                                                    train_or_predict][class_label]:
                    # load the image as numpy array
                    image = skimage.io.imread(image_path)
                    # apply processing function (get back multiple images)
                    images = processing(image)
                    # for each image store a class label
                    if self.run_settings.get("use_super_classes", False):
                        # check if superclass vector for this class label
                        # already generated, if not generate
                        # NOTE(review): dict.has_key is Python 2 only.
                        if not supclass_vecs.has_key(class_label):
                            # get superclass hierarchy for class label
                            supclass_hier = enc.get_encoding(class_label, general_hier)
                            # collapse to a list of 1/0 values
                            supclass_vecs[class_label] = \
                                [el for grp in supclass_hier for el in grp]
                        y += [supclass_vecs[class_label]]*len(images)
                    else:
                        y += [class_label]*len(images)
                    # then broadcast each of these images into the empty X array
                    for image in images:
                        X[image_index,:,:,0] = image
                        image_index += 1
            # if we're normalising
            if processing_settings.get("normalise", False):
                if verbose:
                    print("Applying normalisation: {0}".format(
                        processing_settings["normalise"]["global_or_pixel"]))
                # then call the normalise function
                X,self.run_settings = neukrill_net.image_processing.normalise(X,
                                            self.run_settings, verbose=verbose)
            # make sure y is an array
            y = np.array(y)
            if self.run_settings.get("use_super_classes", False):
                # using superclasses so y already contains target vectors
                super(self.__class__,self).__init__(topo_view=X,y=y)
            else:
                # not using superclasses so map label strings to integers
                # count the y labels
                N_y_labels = len(list(set(y)))
                # build dictionary to encode labels numerically
                class_dictionary = {}
                for i,c in enumerate(self.settings.classes):
                    class_dictionary[c] = i
                # map to integers
                # NOTE(review): relies on Python 2 map() returning a list;
                # on Python 3 np.array(map(...)) would not build the
                # intended array.
                y = np.array(map(lambda c: class_dictionary[c], y))
                # make it 2D column vector
                y = y[np.newaxis].T
                # now run inherited initialisation
                super(self.__class__,self).__init__(topo_view=X,y=y,y_labels=N_y_labels)
                

        elif train_or_predict == "test":
            # split test paths if we're splitting them
            self.settings.image_fnames = neukrill_net.utils.test_split(split, 
                    self.settings.image_fnames)

            # test is just a big list of image paths
            # how many?
            self.N_images = sum(1 for image_path in 
                    self.settings.image_fnames[train_or_predict])
            # check augmentation in the traditional way (it's boilerplate time)
            self.N_images = int(self.N_images*
                    self.run_settings["augmentation_factor"])

            # more boilerplate code, but it's going to be easier than making a
            # function that can deal with the above as well
            # initialise array
            #import pdb
            #pdb.set_trace()
            X = np.zeros((self.N_images,self.run_settings["final_shape"][0],
                self.run_settings["final_shape"][1],1))
            image_index = 0
            if verbose:
                print("Loading this many images:...........................")
                # get a list of 50 image_paths to watch out for
                stepsize = int(len(self.settings.image_fnames[train_or_predict])/50)
                progress_paths = [impath for i,impath in 
                        enumerate(self.settings.image_fnames[train_or_predict]) 
                        if i%stepsize == 0 ]
            # loop over all the images, in order
            for image_path in self.settings.image_fnames[train_or_predict]:
                if verbose:
                    if image_path in progress_paths: 
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        # if it's the last one we better stick a newline on
                        if image_path == progress_paths[-1]:
                            sys.stdout.write(".\n")
                # load the image as numpy array
                image = skimage.io.imread(image_path)
                # apply processing function (get back multiple images)
                images = processing(image)
                # then broadcast each of these images into the empty X array
                for image in images:
                    X[image_index,:,:,0] = image
                    image_index += 1
            # if we're normalising
            if processing_settings.get("normalise",0):
                if verbose:
                    print("Applying normalisation: {0}".format(
                        processing_settings["normalise"]["global_or_pixel"]))
                # then call the normalise function
                X,self.run_settings = neukrill_net.image_processing.normalise(X,
                                            self.run_settings, verbose=verbose)
            # store the names in this dataset object
            self.names = [os.path.basename(fpath) for fpath in 
                    self.settings.image_fnames[train_or_predict]]

            # now run inherited initialisation
            super(self.__class__,self).__init__(topo_view=X)
        else:
            raise ValueError('Invalid option: should be either "train" for'
                             'training or "test" for prediction (I know '
                             ' that is annoying).')
Esempio n. 18
0
def build_doc(page):
    """Decode the byte string *page* and parse it into an lxml HTML
    document via the shared utf8_parser.

    The input is decoded with the encoding sniffed by get_encoding()
    (falling back to UTF-8 when sniffing fails), replacing undecodable
    bytes, then re-encoded as UTF-8 for the parser.
    """
    # Fix: get_encoding() can return None when detection fails, and
    # page.decode(None, ...) raises TypeError -- fall back to UTF-8.
    enc = get_encoding(page) or 'utf-8'
    page_enc = page.decode(enc, 'replace').encode('utf-8')
    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
    return doc
Esempio n. 19
0
                shell = True
            # win32 don't have os.execvp() so have to run command in a shell
            q = Popen(args,
                      stdin=PIPE,
                      stdout=stdout,
                      stderr=stderr,
                      shell=shell,
                      cwd=cwd,
                      **kwargs)
    except OSError, e:
        raise CommandError(list_args, status=e.args[0], stderr=e)
    stdout, stderr = q.communicate(input=stdin)
    status = q.wait()
    if unicode_output == True:
        if encoding == None:
            encoding = get_encoding()
        if stdout != None:
            stdout = unicode(stdout, encoding)
        if stderr != None:
            stderr = unicode(stderr, encoding)
        libbe.LOG.debug(u'{0}\n{1}{2}'.format(status, stdout, stderr))
    else:
        libbe.LOG.debug('{0}\n{1}{2}'.format(status, stdout, stderr))
    if status not in expect:
        raise CommandError(list_args, status, stdout, stderr)
    return status, stdout, stderr


if libbe.TESTING == True:
    suite = doctest.DocTestSuite()