Example #1
0
 def resolve(self,item,captcha_cb=None,wait_cb=None):
     """Resolve a listing item into a downloadable item.

     Follows the "stahnoutSoubor" link on the item page, hands the inline
     base64 captcha image to ``captcha_cb``, submits the free-download form,
     waits through ``wait_cb`` and finally opens the "stahnoutSoubor2" link.

     :param item: dict with at least a ``'url'`` key; it is copied, not mutated
     :param captcha_cb: callable receiving ``{'id': <captcha id>, 'img': <path>}``
         and returning the solved code (a falsy value aborts)
     :param wait_cb: callable receiving the countdown in seconds
     :return: the copied item with ``'surl'`` set to the page url and ``'url'``
         set to the url reported by the final response, or ``None`` when any
         step cannot be completed
     """
     item = item.copy()
     url = self._url(item['url'])
     item['surl'] = url
     data = util.request(url)
     link = re.search('<a class="stahnoutSoubor.+?href=\"([^\"]+)',data)
     if not link:
         return
     url = self._url(link.group(1))
     data = util.request(url)
     m = re.search('<img src=\"(?P<img>[^\"]+)\" alt=\"Captcha\"',data)
     cap_id = re.search('<input type=\"hidden\" name=\"_uid_captcha.+?value=\"(?P<cid>[^\"]+)',data)
     if not (m and cap_id):
         return
     # Without a callback nobody can solve the captcha; treat it like an
     # empty answer instead of failing with "NoneType is not callable".
     if captcha_cb is None:
         return
     cid = cap_id.group('cid')
     img_src = m.group('img')
     # The image is embedded in the src attribute; keep what follows 'base64,'.
     img_data = img_src[img_src.find('base64,')+7:]
     if not os.path.exists(self.tmp_dir):
         os.makedirs(self.tmp_dir)
     tmp_image = os.path.join(self.tmp_dir,'captcha.png')
     util.save_data_to_file(base64.b64decode(img_data),tmp_image)
     code = captcha_cb({'id':cid,'img': tmp_image})
     if not code:
         return
     data = util.post(url+'?do=stahnoutFreeForm-submit',{'_uid_captcha':cid,'captcha':code,'stahnoutSoubor':'Stáhnout'})
     countdown = re.search(r'shortly\.getSeconds\(\) \+ (\d+)',data)
     last_url = re.search('<a class=\"stahnoutSoubor2.+?href=\"([^\"]+)',data)
     if not (countdown and last_url):
         return
     wait = int(countdown.group(1))
     url = self._url(last_url.group(1))
     if wait_cb is not None:
         wait_cb(wait)
     req = urllib2.Request(url)
     req.add_header('User-Agent',util.UA)
     resp = urllib2.urlopen(req)
     try:
         item['url'] = resp.geturl()
     finally:
         # Only the final url is needed; always release the connection.
         resp.close()
     return item
Example #2
0
 def resolve(self,item,captcha_cb=None,wait_cb=None):
     """Resolve a listing item into a downloadable item.

     Steps: open the item page, follow the "stahnoutSoubor" link, decode the
     inline base64 captcha into ``<tmp_dir>/captcha.png``, ask ``captcha_cb``
     for the code, post the free-download form, report the countdown to
     ``wait_cb`` and open the "stahnoutSoubor2" link.

     :param item: dict with at least a ``'url'`` key; a copy is modified
     :param captcha_cb: callable receiving ``{'id': <captcha id>, 'img': <path>}``
         and returning the solved code; a falsy answer aborts
     :param wait_cb: callable receiving the countdown in seconds
     :return: the copied item (``'surl'`` = page url, ``'url'`` = url reported
         by the final response) or ``None`` if any step fails
     """
     item = item.copy()
     url = self._url(item['url'])
     item['surl'] = url
     data = util.request(url)
     link = re.search('<a class="stahnoutSoubor.+?href=\"([^\"]+)',data)
     if link:
         url = self._url(link.group(1))
         data = util.request(url)
         m = re.search('<img src=\"(?P<img>[^\"]+)\" alt=\"Captcha\"',data)
         cap_id = re.search('<input type=\"hidden\" name=\"_uid_captcha.+?value=\"(?P<cid>[^\"]+)',data)
         # captcha_cb defaults to None; without it the captcha cannot be
         # solved, so give up quietly instead of raising TypeError.
         if m and cap_id and captcha_cb:
             cid = cap_id.group('cid')
             # The image is embedded in the src attribute after 'base64,'.
             img_data = m.group('img')[m.group('img').find('base64,')+7:]
             if not os.path.exists(self.tmp_dir):
                 os.makedirs(self.tmp_dir)
             tmp_image = os.path.join(self.tmp_dir,'captcha.png')
             util.save_data_to_file(base64.b64decode(img_data),tmp_image)
             code = captcha_cb({'id':cid,'img': tmp_image})
             if not code:
                 return
             data = util.post(url+'?do=stahnoutFreeForm-submit',{'_uid_captcha':cid,'captcha':code,'stahnoutSoubor':'Stáhnout'})
             countdown = re.search(r'shortly\.getSeconds\(\) \+ (\d+)',data)
             last_url = re.search('<a class=\"stahnoutSoubor2.+?href=\"([^\"]+)',data)
             if countdown and last_url:
                 wait = int(countdown.group(1))
                 url = self._url(last_url.group(1))
                 # wait_cb is optional as well; only call it when supplied.
                 if wait_cb:
                     wait_cb(wait)
                 req = urllib2.Request(url)
                 req.add_header('User-Agent',util.UA)
                 resp = urllib2.urlopen(req)
                 try:
                     item['url'] = resp.geturl()
                 finally:
                     resp.close()
                 return item
Example #3
0
 def _get_plot(self,data,local):
     """Convert the "tale_description" HTML fragment to [B]/[I]/[CR] markup and save it.

     The fragment between ``<div id="tale_description"`` and
     ``<div class="cleaner`` is rewritten by a fixed sequence of regex
     substitutions, HTML-decoded, utf-8 encoded and written to ``local``.
     """
     text = util.substr(data,'<div id=\"tale_description\"','<div class=\"cleaner')
     # Order matters: the later rules clean up markup produced by earlier ones.
     rules = (
         ('<div[^>]+>',''),
         ('<table.*',''),
         ('</span>|<br[^>]*>|<ul>|</ul>|<hr[^>]*>',''),
         ('<span[^>]*>|<p[^>]*>|<li[^>]*>',''),
         ('<strong>|<a[^>]*>|<h[\d]+>','[B]'),
         ('</strong>|</a>|</h[\d]+>','[/B]'),
         ('</p>|</li>','[CR]'),
         ('<em>','[I]'),
         ('</em>','[/I]'),
         ('<img[^>]+>',''),
         ('\[B\]Edituj popis\[\/B\]',''),
         ('\[B\]\[B\]','[B]'),
         ('\[/B\]\[/B\]','[/B]'),
         ('\[B\][ ]*\[/B\]',''),
     )
     for pattern,replacement in rules:
         text = re.sub(pattern,replacement,text)
     decoded = util.decode_html(''.join(text))
     util.save_data_to_file(decoded.encode('utf-8'),local)
Example #4
0
 def download(remote, local):
     """Fetch ``remote`` with ``util.request`` and write the result to ``local``."""
     payload = util.request(remote)
     util.save_data_to_file(payload, local)
Example #5
0
 def _get_image(self,data,local):
     """Download the ``tale_picture`` image referenced in ``data`` to ``local``.

     Does nothing when no such <img> tag is found.
     """
     found = re.search('<img id=\"tale_picture\" src=\"(?P<img>[^\"]+)', data, re.IGNORECASE | re.DOTALL)
     if found is None:
         return
     image_url = self._url(found.group('img'))
     content = util.request(image_url)
     util.save_data_to_file(content,local)
Example #6
0
 def download(remote, local):
     """Request ``remote`` and save whatever ``util.request`` returns into ``local``."""
     content = util.request(remote)
     util.save_data_to_file(content, local)
Example #7
0
 def _get_image(self,data,local):
     """Save the src of the first <img> inside the "entry-photo" div to ``local``.

     Does nothing when the section contains no <img ... src="..."> tag.
     """
     section = util.substr(data,'<div class=\"entry-photo\"','</div>')
     found = re.search('<img(.+?)src=\"(?P<img>[^\"]+)', section, re.IGNORECASE | re.DOTALL)
     if found is None:
         return
     # NOTE(review): this writes the matched src text itself, not content
     # fetched from it -- confirm that is what callers expect.
     util.save_data_to_file(found.group('img'),local)
Example #8
0
 def _get_plot(self,data,local):
     """Save the first <strong>/<b> text of the "entry-content" div to ``local``.

     The text is HTML-decoded and utf-8 encoded before saving; nothing is
     written when no such tag is found.
     """
     section = util.substr(data,'<div class=\"entry-content\"','</p>')
     found = re.search('<(strong|b)>(?P<plot>(.+?))<', section, re.IGNORECASE | re.DOTALL)
     if found is None:
         return
     plot = util.decode_html(found.group('plot'))
     util.save_data_to_file(plot.encode('utf-8'),local)
Example #9
0
 def resolve(self, item, captcha_cb=None, select_cb=None):
     """Resolve a listing item into a direct file url by solving the captcha form.

     The item page is fetched with a bare opener (HTTPHandler and
     UnknownHandler only -- presumably so a 302 reply can be inspected rather
     than followed). The captcha image is downloaded with the page's
     PHPSESSID cookie and handed to ``captcha_cb``; the answer is posted to
     the form action.

     :param item: dict with at least a ``'url'`` key; a copy is modified
     :param captcha_cb: callable receiving ``{'id': '0', 'img': <path>}`` and
         returning the captcha code
     :param select_cb: unused here, only passed on when retrying
     :return: the copied item with ``'url'`` set to the file url, or ``None``
         when the page, the session cookie or the captcha answer is missing
     :raises ResolveException: when the server answers with the form page again
     """
     item = item.copy()
     util.init_urllib()
     url = self._url(item['url'])
     page = ''
     try:
         opener = OpenerDirector()
         opener.add_handler(HTTPHandler())
         opener.add_handler(UnknownHandler())
         install_opener(opener)
         request = Request(url)
         request.add_header('User-Agent', util.UA)
         response = urlopen(request)
         try:
             page = response.read()
         finally:
             response.close()
     except HTTPError:
         traceback.print_exc()
         return
     data = util.substr(page, '<form method=post target=\"iframe_dwn\"',
                        '</form>')
     action = re.search('action=(?P<url>[^>]+)', data,
                        re.IGNORECASE | re.DOTALL)
     img = re.search('<img src=\"(?P<url>[^\"]+)', data,
                     re.IGNORECASE | re.DOTALL)
     if not (img and action):
         return
     # The header may be absent; fall back to '' so the search cannot fail
     # with a TypeError.
     set_cookie = response.headers.get('Set-Cookie') or ''
     sessid = re.findall(r'(PHPSESSID=[^\;]+)', set_cookie,
                         re.IGNORECASE | re.DOTALL)
     if not sessid:
         # The captcha image and the form post both need the session cookie.
         self.error('No PHPSESSID cookie received')
         return
     session_cookie = sessid[-1]
     # we have to download image ourselves
     image = util.request(self._url(img.group('url')),
                          headers={
                              'Referer': url,
                              'Cookie': session_cookie
                          })
     img_file = os.path.join(self.tmp_dir, 'captcha.png')
     util.save_data_to_file(image, img_file)
     code = None
     if captcha_cb:
         code = captcha_cb({'id': '0', 'img': img_file})
     if not code:
         self.info('No captcha received, exit')
         return
     request = urllib.urlencode({'code': code})
     req = Request(self._url(action.group('url')), request)
     req.add_header('User-Agent', util.UA)
     req.add_header('Referer', url)
     req.add_header('Cookie', session_cookie)
     try:
         resp = urlopen(req)
         if resp.code == 302:
             file_url = resp.headers.get('location')
         else:
             file_url = resp.geturl()
         # Ending up on the form action again means the download was refused.
         if file_url.find(action.group('url')) > 0:
             msg = resp.read()
             resp.close()
             js_msg = re.search(r"alert\('(?P<msg>[^']+)", msg,
                                re.IGNORECASE | re.DOTALL)
             if js_msg:
                 raise ResolveException(js_msg.group('msg'))
             self.error(msg)
             raise ResolveException(
                 'Nelze ziskat soubor, zkuste to znovu')
         resp.close()
         if file_url.find('data') >= 0 or file_url.find(
                 'download_free') > 0:
             item['url'] = file_url
             return item
         self.error('wrong captcha, retrying')
         return self.resolve(item, captcha_cb, select_cb)
     except HTTPError:
         traceback.print_exc()
         return
Example #10
0
class FastshareContentProvider(ContentProvider):
    """Content provider for fastshare.cz supporting search and captcha resolve."""

    def __init__(self,username=None,password=None,filter=None,tmp_dir='.'):
        """Initialise the provider and install a cookie-aware global urllib2 opener."""
        ContentProvider.__init__(self,'fastshare.cz','http://www.fastshare.cz/',username,password,filter,tmp_dir)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar()))
        urllib2.install_opener(opener)

    def capabilities(self):
        """Return the list of supported operations."""
        return ['search','resolve']

    def search(self,keyword):
        """List search results for ``keyword`` (url-quoted into the ``term`` query)."""
        return self.list('?term='+urllib.quote(keyword))

    def list(self,url):
        """Parse a result page into video items plus an optional 'next' dir item.

        :param url: url (relative or absolute) of the page to list
        :return: list of item dicts that passed ``self._filter``, followed by
            a ``type == 'next'`` item when a "dal..." link is present
        """
        result = []
        page = util.request(self._url(url))
        data = util.substr(page,'<div class=\"search','<footer')
        for m in re.finditer('<div class=\"search-result-box(.+?)</a>',data,re.IGNORECASE | re.DOTALL ):
            it = m.group(1)
            link = re.search('<a href=([^ ]+)',it,re.IGNORECASE | re.DOTALL)
            name = re.search('title=\"([^\"]+)',it,re.IGNORECASE | re.DOTALL)
            img = re.search('<img src=\"([^\"]+)',it,re.IGNORECASE | re.DOTALL)
            size = re.search('<div class=\"fs\">([^<]+)',it,re.IGNORECASE | re.DOTALL)
            length = re.search('<div class=\"vd\">([^<]+)',it,re.IGNORECASE | re.DOTALL)
            if name and link:
                item = self.video_item()
                item['title'] = name.group(1)
                if size:
                    item['size'] = size.group(1).strip()
                if length:
                    item['length'] =  length.group(1).strip()
                item['url'] = self._url(link.group(1))
                # A thumbnail is optional just like size and length; a result
                # without one must not abort the whole listing.
                if img:
                    item['img'] = self._url(img.group(1))
                self._filter(result,item)
        next_link = re.search('<a href=\"(?P<url>[^\"]+)[^>]+>dal',data,re.IGNORECASE | re.DOTALL)
        if next_link:
            item = self.dir_item()
            item['type'] = 'next'
            item['url'] = next_link.group('url')
            result.append(item)
        return result


    def resolve(self,item,captcha_cb=None,select_cb=None):
        """Resolve a listing item into a direct file url by solving the captcha form.

        The global opener is replaced by a bare one (HTTPHandler and
        UnknownHandler only -- presumably so a 302 reply can be inspected
        rather than followed). The captcha image is downloaded with the
        page's PHPSESSID cookie, handed to ``captcha_cb`` and the answer is
        posted to the form action.

        :param item: dict with at least a ``'url'`` key; a copy is modified
        :param captcha_cb: callable receiving ``{'id': '0', 'img': <path>}``
            and returning the captcha code
        :param select_cb: unused here, only passed on when retrying
        :return: the copied item with ``'url'`` set to the file url, or
            ``None`` when the page, session cookie or captcha answer is missing
        :raises ResolveException: when the server answers with the form page again
        """
        item = item.copy()
        util.init_urllib()
        url = self._url(item['url'])
        page = ''
        try:
            opener = urllib2.OpenerDirector()
            opener.add_handler(urllib2.HTTPHandler())
            opener.add_handler(urllib2.UnknownHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent',util.UA)
            response = urllib2.urlopen(request)
            try:
                page = response.read()
            finally:
                response.close()
        except urllib2.HTTPError:
            traceback.print_exc()
            return
        data = util.substr(page,'<form method=post target=\"iframe_dwn\"','</form>')
        action = re.search('action=(?P<url>[^>]+)',data,re.IGNORECASE | re.DOTALL)
        img = re.search('<img src=\"(?P<url>[^\"]+)',data,re.IGNORECASE | re.DOTALL)
        if not (img and action):
            return
        # The header may be absent; fall back to '' so the search cannot fail
        # with a TypeError.
        set_cookie = response.headers.get('Set-Cookie') or ''
        sessid = re.findall(r'(PHPSESSID=[^\;]+)',set_cookie,re.IGNORECASE | re.DOTALL)
        if not sessid:
            # The captcha image and the form post both need the session cookie.
            self.error('No PHPSESSID cookie received')
            return
        session_cookie = sessid[-1]
        # we have to download image ourselves
        image = util.request(self._url(img.group('url')),headers={'Referer':url,'Cookie':session_cookie})
        img_file = os.path.join(self.tmp_dir,'captcha.png')
        util.save_data_to_file(image,img_file)
        code = None
        if captcha_cb:
            code = captcha_cb({'id':'0','img':img_file})
        if not code:
            self.info('No captcha received, exit')
            return
        request = urllib.urlencode({'code':code})
        req = urllib2.Request(self._url(action.group('url')),request)
        req.add_header('User-Agent',util.UA)
        req.add_header('Referer',url)
        req.add_header('Cookie',session_cookie)
        try:
            resp = urllib2.urlopen(req)
            if resp.code == 302:
                file_url = resp.headers.get('location')
            else:
                file_url = resp.geturl()
            # Ending up on the form action again means the download was refused.
            if file_url.find(action.group('url')) > 0:
                msg = resp.read()
                resp.close()
                js_msg = re.search(r"alert\('(?P<msg>[^']+)",msg,re.IGNORECASE | re.DOTALL)
                if js_msg:
                    raise ResolveException(js_msg.group('msg'))
                self.error(msg)
                raise ResolveException('Nelze ziskat soubor, zkuste to znovu')
            resp.close()
            if file_url.find('data') >=0 or file_url.find('download_free') > 0:
                item['url'] = file_url
                return item
            self.error('wrong captcha, retrying')
            return self.resolve(item,captcha_cb,select_cb)
        except urllib2.HTTPError:
            traceback.print_exc()
            return
from bs4 import BeautifulSoup
import requests

from util import save_data_to_file, extract_data

# Scrape the Chicken Inn menu page and store the extracted items as JSON.
url = "http://www.dialadeliverykenya.co.ke/chicken-inn-menu"
json_file = "chicken_inn.json"

# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed as a menu.
page = requests.get(url, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

# Each menu entry lives in a div carrying both of these classes.
items = soup.find_all('div', class_='tab-inner chicken-padder')

structured_items = [extract_data(item, url) for item in items]

save_data_to_file(structured_items, json_file)