Beispiel #1
0
def navsenter(lev1, lev2):
    """Record every product id listed under one second-level category.

    Fetches the paginated item list for category ``lev2`` (45 items per
    page), extracts each bracketed ``T...`` id found in the page text and
    registers it in the global ``navs`` mapping as
    ``navs[id]['cats'][lev1] = set of lev2 ids``.

    Side effects on module globals:
      * ``navs`` gains/extends one entry per id found.
      * ``liczbaproduktowlev2all`` grows by the product count the site
        reports for this category.
      * ``liczbaproduktowlev2allstep`` grows by the number of ids actually
        extracted.

    Exits the process with status 1 when the reported product count cannot
    be located, when the number of extracted ids differs from the reported
    count, or when the hard-coded debug id ``T001885`` is encountered.
    """
    global navs
    global liczbaproduktowlev2allstep
    global liczbaproduktowlev2all

    elemnastrone = 45  # items requested per page
    baseurl = ('http://sharg.pl/ajax/items_list.php?c_id=' + str(lev2)
               + '&items_count=' + str(elemnastrone))
    soup = getfileofurl(baseurl)

    # The first page carries the total product count for the category.
    n = re.search(r'Liczba produktów: (?P<word>.+)', str(soup))
    if n is None:
        # Fail with a readable message instead of an AttributeError on None.
        print('bad!!! no product count found at ' + baseurl)
        sys.exit(1)
    liczbaproduktowlev2 = int(n.group('word'))
    liczbaproduktowlev2all += liczbaproduktowlev2
    liczbastron = int(ceil(liczbaproduktowlev2 / float(elemnastrone)))

    # Compiled once; the same pattern is applied to every page.
    navpattern = re.compile(r'\[(?P<word>T.+)\]')
    liczbaproduktowlev2step = 0
    for i in range(liczbastron):
        url = baseurl + '&page_index=' + str(i + 1)  # pages are 1-based
        print(url)
        soup = getfileofurl(url)
        for nav in navpattern.findall(str(soup)):
            if nav == 'T001885':
                # Debug trap: dump the page that contains this id and stop.
                with open('f****d.html', 'w') as logfile:
                    logfile.write(unescape(str(soup)))
                print(nav)
                sys.exit(1)
            # Create the nested entry on first sight, then add this category.
            entry = navs.setdefault(nav, {'cats': {}})
            entry['cats'].setdefault(lev1, set()).add(lev2)
            liczbaproduktowlev2step += 1
    liczbaproduktowlev2allstep += liczbaproduktowlev2step

    # Sanity check: every advertised product must have produced one id.
    if liczbaproduktowlev2step != liczbaproduktowlev2:
        print('bad!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! '
              + str(liczbaproduktowlev2step) + '!=' + str(liczbaproduktowlev2))
        sys.exit(1)
Elements are written as "<tag>text</tag>".

In [51]: print(html.escape(s))
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.

In [52]: print(html.escape(s, quote=False))
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".

In [53]: s = 'Spicy Jalapeño'

In [55]: s.encode('ascii', errors='xmlcharrefreplace')
Out[55]: b'Spicy Jalape&#241;o'

In [56]: s = 'Spicy &quot;Jalape&#241;o&quot.'

In [57]: from html.parser import HTMLParser

In [58]: p = HTMLParser()

In [59]: p.unescape(s)
d:\Program Files\Anaconda3\Scripts\ipython-script.py:1: DeprecationWarning: The unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.
  if __name__ == '__main__':
Out[59]: 'Spicy "Jalapeño".'

In [62]: from xml.sax.saxutils import unescape

In [63]: unescape(t)
Out[63]: 'The prompt is >>>'
--- docutils/utils/__init__.py.orig	2019-04-07 08:08:42 UTC
+++ docutils/utils/__init__.py
@@ -18,6 +18,7 @@ import warnings
 import unicodedata
 from docutils import ApplicationError, DataError, __version_info__
 from docutils import nodes
+from docutils.nodes import unescape
 import docutils.io
 from docutils.utils.error_reporting import ErrorOutput, SafeString
 
@@ -576,18 +577,7 @@ def escape2null(text):
         parts.append('\x00' + text[found+1:found+2])
         start = found + 2               # skip character after escape
 
-def unescape(text, restore_backslashes=False, respect_whitespace=False):
-    """
-    Return a string with nulls removed or restored to backslashes.
-    Backslash-escaped spaces are also removed.
-    """
-    # `respect_whitespace` is ignored (since introduction 2016-12-16)
-    if restore_backslashes:
-        return text.replace('\x00', '\\')
-    else:
-        for sep in ['\x00 ', '\x00\n', '\x00']:
-            text = ''.join(text.split(sep))
-        return text
+# `unescape` definition moved to `nodes` to avoid circular import dependency.
 
 def split_escaped_whitespace(text):
     """
Beispiel #4
0
# Remove the file named by `fname` if it already exists.
if os.path.isfile(fname):
    os.remove(fname)

# Module-level accumulators, all starting empty / at zero.
navs = {}
liczbaproduktowlev2allstep = 0

print("\nnow categories")
url = 'http://sharg.pl/index.php'
soup = getfileofurl(url)
cats = {}
liczbaproduktowlev1all = 0
liczbaproduktowlev2all = 0

# Each "menu" div links to a category; the id sits between `c_id=` and `&`.
p = re.compile(r'c_id=(?P<word>.+)&')
for i in soup('div', "menu"):
    m = p.search(i.a['href'])
    key = m.group('word')
    value = unescape(i.a.string.strip())
    # One entry per category: display name plus an empty sub-category map.
    cats[key] = {'name': value, 'lev2': {}}

for k,v in cats.items():
    url = 'http://sharg.pl/items.php?c_id='+str(k)
    soup = getfileofurl(url)
    s = re.compile(r'Liczba produktów: (?P<word>.+)')
    n = s.search(str(soup))
    liczbaproduktowlev1=int(n.group('word'))
    #print "cccccccccccccccccccccccc"
    #print 'liczbaproduktowlev1='+str(liczbaproduktowlev1)
    liczbaproduktowlev1all+=liczbaproduktowlev1
    p = re.compile(r'c_id=(?P<word>.+)&')
    for i in soup('div',"menu"):
        if "&nbsp;|&nbsp;" in str(i):