Example #1
    def check(self, instance):
        config_url = instance.get('url')
        added_tags = instance.get('tags')
        if config_url is None:
            raise Exception("A URL must be specified")

        # Load basic authentication configuration, if available.
        username, password = instance.get('username'), instance.get('password')
        if username and password:
            auth = (username, password)
        else:
            auth = None

        # Support URLs that have a path in them from the config, for
        # backwards-compatibility.
        parsed = urlparse.urlparse(config_url)
        if parsed.path != "":
            config_url = "%s://%s" % (parsed.scheme, parsed.netloc)

        # Tag by URL so we can differentiate the metrics from multiple instances
        tags = ['url:%s' % config_url]
        if added_tags is not None:
            for tag in added_tags:
                tags.append(tag)

        # Load stats data.
        url = urlparse.urljoin(config_url, STATS_URL)
        stats_data = self._get_data(url, auth)
        self._process_stats_data(config_url, stats_data, auth, tags=tags)

        # Load the health data.
        url = urlparse.urljoin(config_url, HEALTH_URL)
        health_data = self._get_data(url, auth)
        self._process_health_data(config_url, health_data, tags=tags)
Example #2
def search_by_build_id(hex_encoded_id):
    """
    Given a hex-encoded Build ID, return the path to an ELF with that Build ID
    on the local system.

    If it can't be found, return None.
    """
    cache = cache_dir + hex_encoded_id

    if os.path.exists(cache) and read(cache).startswith('\x7FELF'):
        log.info_once("Using cached data from %r" % cache)
        return cache

    log.info("Downloading data from GitHub")

    url_base = "https://gitlab.com/libcdb/libcdb/raw/master/hashes/build_id/"
    url      = urlparse.urljoin(url_base, hex_encoded_id)

    data   = ""
    while not data.startswith('\x7fELF'):
        data = wget(url)

        if not data:
            return None

        if data.startswith('..'):
            url = os.path.dirname(url) + '/'
            url = urlparse.urljoin(url, data)

    write(cache, data)
    return cache
Example #3
 def ogone_form_generate_values(self, cr, uid, id, values, context=None):
     base_url = self.pool['ir.config_parameter'].get_param(cr, uid, 'web.base.url')
     acquirer = self.browse(cr, uid, id, context=context)
     ogone_tx_values = dict(values)
     temp_ogone_tx_values = {
         'PSPID': acquirer.ogone_pspid,
         'ORDERID': values['reference'],
         'AMOUNT': float_repr(float_round(values['amount'], 2) * 100, 0),
         'CURRENCY': values['currency'] and values['currency'].name or '',
         'LANGUAGE': values.get('partner_lang'),
         'CN': values.get('partner_name'),
         'EMAIL': values.get('partner_email'),
         'OWNERZIP': values.get('partner_zip'),
         'OWNERADDRESS': values.get('partner_address'),
         'OWNERTOWN': values.get('partner_city'),
         'OWNERCTY': values.get('partner_country') and values.get('partner_country').code or '',
         'OWNERTELNO': values.get('partner_phone'),
         'ACCEPTURL': '%s' % urlparse.urljoin(base_url, OgoneController._accept_url),
         'DECLINEURL': '%s' % urlparse.urljoin(base_url, OgoneController._decline_url),
         'EXCEPTIONURL': '%s' % urlparse.urljoin(base_url, OgoneController._exception_url),
         'CANCELURL': '%s' % urlparse.urljoin(base_url, OgoneController._cancel_url),
         'PARAMPLUS': 'return_url=%s' % ogone_tx_values.pop('return_url') if ogone_tx_values.get('return_url') else False,
     }
     if values.get('type') == 'form_save':
         temp_ogone_tx_values.update({
             'ALIAS': 'ODOO-NEW-ALIAS-%s' % time.time(),    # something unique,
             'ALIASUSAGE': values.get('alias_usage') or acquirer.ogone_alias_usage,
         })
     shasign = self._ogone_generate_shasign(acquirer, 'in', temp_ogone_tx_values)
     temp_ogone_tx_values['SHASIGN'] = shasign
     ogone_tx_values.update(temp_ogone_tx_values)
     return ogone_tx_values
Example #4
 def get_sub(self):
     """Fetches the subtitles from addic7ed using the URL stored in the given database (db) for this episode."""
     url_split = urlparse.urlsplit(self.url)
     head, tail = url_split.path.rsplit('/', 1)
     new_path = head, 'addic7ed'
     referer = urlparse.urlunsplit(url_split._replace(path=urlparse.urljoin(*new_path)))

     domain = self.url
     response = urllib2.urlopen(domain)  # open the url
     html = response.read()  # load the html code
     soup = BeautifulSoup(html)  # parse the html code
     links = []
     for x in soup.find_all(class_="buttonDownload"):
         links.append(x.attrs['href'])

     domain = 'http://www.addic7ed.com/'
     urls = []
     for link in links:
         urls.append(urlparse.urljoin(domain, link))

     page = urls[0]
     req = urllib2.Request(page, headers={'User-Agent': 'Mozilla 5.10', 'Referer': referer})
     response = urllib2.urlopen(req)
     data = response.read()

     test = response.info()
     print test

     if response.info().has_key('Content-Disposition'):
         with open(os.path.join(self.db.env.subs_dir, '%s.srt' % self.title), 'wb') as f:
             f.write(data)
     else:
         return response.info()
Example #5
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources
            url = urlparse.urljoin(self.base_link, url)
            for i in range(3):
                result = client.request(url, timeout=10)
                if not result == None: break
            
            dom = dom_parser.parse_dom(result, 'div', attrs={'class':'links', 'id': 'noSubs'})
            result = dom[0].content
            
            links = re.compile('<tr\s*>\s*<td><i\s+class="fa fa-youtube link-logo"></i>([^<]+).*?href="([^"]+)"\s+class="watch',re.DOTALL).findall(result)         
            for link in links[:5]:
                try:
                    url2 = urlparse.urljoin(self.base_link, link[1])
                    for i in range(2):
                        result2 = client.request(url2, timeout=3)
                        if not result2 == None: break                    
                    r = re.compile('href="([^"]+)"\s+class="action-btn').findall(result2)[0]
                    valid, hoster = source_utils.is_host_valid(r, hostDict)
                    if not valid: continue
                    urls, host, direct = source_utils.check_directstreams(r, hoster)
                    for x in urls: sources.append({'source': host, 'quality': x['quality'], 'language': 'en', 'url': x['url'], 'direct': direct, 'debridonly': False})
                    
                except:
                    #traceback.print_exc()
                    pass           
                    
            return sources
        except:
            return sources
Example #6
def novedades(item):
    logger.info("pelisalacarta.channels.animeflv novedades")

    # Download the page
    data = scrapertools.cache_page(item.url)

    # Extract the entries (folders)
    '''
    <div class="not">
    <a href="/ver/cyclops-shoujo-saipu-12.html" title="Cyclops Shoujo Saipu 12">
    <img class="imglstsr lazy" src="http://cdn.animeflv.net/img/mini/957.jpg" border="0">
    <span class="tit_ep"><span class="tit">Cyclops Shoujo Saipu 12</span></span>
    </a>
    '''
    patronvideos = '<div class="not"[^<]+<a href="([^"]+)" title="([^"]+)"[^<]+<img class="[^"]+" src="([^"]+)"[^<]+' \
                   '<span class="tit_ep"><span class="tit">([^<]+)<'
    matches = re.compile(patronvideos, re.DOTALL).findall(data)
    itemlist = []
    
    for match in matches:
        scrapedtitle = scrapertools.entityunescape(match[3])
        fulltitle = scrapedtitle
        # directory = match[1]
        scrapedurl = urlparse.urljoin(item.url, match[0])
        scrapedthumbnail = urlparse.urljoin(item.url, match[2].replace("mini", "portada"))
        scrapedplot = ""
        if DEBUG:
            logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]")

        itemlist.append(Item(channel=__channel__, action="findvideos", title=scrapedtitle, url=scrapedurl,
                             thumbnail=scrapedthumbnail, plot=scrapedplot, fulltitle=fulltitle, viewmode="movie"))

    return itemlist
Example #7
 def handle_captcha(self, response, solver):
     sel = scrapy.Selector(response)
     iframe_src = sel.xpath(self.CAPTCHA_XPATH).extract()[0]
     iframe_url = urljoin(response.url, iframe_src)
     iframe_request = scrapy.Request(iframe_url)
     iframe_response = yield download(self.crawler, iframe_request)
     iframe_sel = scrapy.Selector(iframe_response)
     img_src, = iframe_sel.xpath('//img/@src').extract()[:1] or [None]
     if img_src is None:
         raise DecaptchaError('No //img/@src found on CAPTCHA page')
     img_url = urljoin(iframe_response.url, img_src)
     img_request = scrapy.Request(img_url)
     img_response = yield download(self.crawler, img_request)
     scrapy.log.msg('CAPTCHA image downloaded, solving')
     captcha_text = yield solver.solve(img_response.body)
     scrapy.log.msg('CAPTCHA solved: %s' % captcha_text)
     challenge_request = scrapy.FormRequest.from_response(
         iframe_response, formxpath='//form',
         formdata={'recaptcha_response_field': captcha_text}
     )
     challenge_response = yield download(self.crawler, challenge_request)
     challenge_sel = scrapy.Selector(challenge_response)
     challenge, = challenge_sel.xpath(
         '//textarea/text()'
     ).extract()[:1] or [None]
     if not challenge:
         raise DecaptchaError('Bad challenge from reCAPTCHA API:\n%s' %
                              challenge_response.body)
     scrapy.log.msg('CAPTCHA solved, submitting challenge')
     submit_request = scrapy.FormRequest.from_response(
         response, formxpath='//form[.%s]' % self.CAPTCHA_XPATH,
         formdata={'recaptcha_challenge_field': challenge}
     )
     yield download(self.crawler, submit_request)
Example #8
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for div in hxs.select('//div[@id="contem_boxes"]'):
            titulo = div.select('.//div[@id="contem_titulo"]/text()').extract()[0]

            if not titulo.endswith(u'mara dos Deputados/BR'):
                continue
            else:
                reg = re.compile('<a class="listapar" href="(?P<url>.*?)">(?P<name>[\w\s]*[\w]+)\s*\(<b>[\w\s]+</b>\)\s-\s(?P<party>.*?)\/(?P<state>.*?)</a><br>', flags=re.U)
                for r in reg.finditer(div.extract()):
                    dict_deputy = r.groupdict()
                    #if dict_deputy['state'] in settings['STATE_TO_FILTER']:
                    db_deputy = self.api.get_deputado_por_nome(dict_deputy['name'])
                    if not db_deputy:
                        dep = Deputado(dict_deputy['name'], dict_deputy['state'], dict_deputy['party'])
                        self.api.inserir_deputado(dep)
                    else:
                        dep = db_deputy[0]

                    id = urlparse.parse_qs(urlparse.urlparse(dict_deputy['url']).query).get('id', [0])[0]
                    if not id:
                        continue
                    request = Request(urljoin(self.base_url, '@presencas.php?id=%s' % id), callback=self.parse_deputy_assiduity)
                    request.meta['dep'] = dep
                    yield request
                    
                    request = Request(urljoin(self.base_url, '@uso_verbas_als.php?uf=16&id=%s' % id), callback=self.parse_deputy_costs)
                    request.meta['dep'] = dep
                    yield request
Example #9
def getL10nRepositories(changesets, l10nRepoPath, relbranch=None):
    """Parses a list of locale names and revisions for their associated
       repository from the 'changesets' string passed in."""
    # urljoin() will strip the last part of l10nRepoPath if it doesn't end
    # with "/"
    if not l10nRepoPath.endswith('/'):
        l10nRepoPath = l10nRepoPath + '/'
    repositories = {}
    try:
        for locale, data in json.loads(changesets).iteritems():
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': data['revision'],
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }
    except (TypeError, ValueError):
        for locale, revision in parsePlainL10nChangesets(changesets).iteritems():
            if revision == 'FIXME':
                raise Exception('Found FIXME in changesets for locale "%s"' % locale)
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': revision,
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }

    return repositories
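
The comment above leans on how urljoin treats a base path without a trailing slash. A minimal sketch (Python 2, with made-up locale and path values that are not taken from the example) shows the difference:

from urlparse import urljoin

# Without the trailing slash the last path segment is replaced...
print urljoin('releases/l10n/mozilla-release', 'de')   # -> 'releases/l10n/de'
# ...with the trailing slash the locale is appended as intended.
print urljoin('releases/l10n/mozilla-release/', 'de')  # -> 'releases/l10n/mozilla-release/de'
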
Example #10
def peliscat(item):
	logger.info("[cinegratis.py] peliscat")

	url = item.url

	itemlist = []
	itemlist.append( Item(channel=CHANNELNAME, action="listsimple" , title="Versión original" , url="http://www.cinegratis.net/index.php?module=search&title=subtitulado"))
	itemlist.append( Item(channel=CHANNELNAME, action="listsimple" , title="Versión latina"   , url="http://www.cinegratis.net/index.php?module=search&title=latino"))

	# Download the page
	data = scrapertools.cachePage(url)

	# Extract the items
	patronvideos  = "<td align='left'><a href='([^']+)'><img src='([^']+)' border='0'></a></td>"
	matches = re.compile(patronvideos,re.DOTALL).findall(data)
	scrapertools.printMatches(matches)

	for match in matches:
		# Attributes
		patron2 = "genero/([A-Za-z\-]+)/"
		matches2 = re.compile(patron2,re.DOTALL).findall(match[0])
		scrapertools.printMatches(matches2)
		
		scrapedtitle = matches2[0]
		scrapedurl = urlparse.urljoin(url,match[0])
		scrapedthumbnail = urlparse.urljoin(url,match[1])
		scrapedplot = ""
		if (DEBUG): logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]")

		itemlist.append( Item(channel=CHANNELNAME, action="listvideos" , title=scrapedtitle , url=scrapedurl, thumbnail=scrapedthumbnail, plot=scrapedplot))

	return itemlist
Example #11
def novedades(item):
    logger.info("[serieonline.py] novedades")

    # Download the page
    data = scrapertools.cachePage(item.url)

    # Extract the entries
    patronvideos  = '<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" alt="([^"]+)" class="captify" /></a>'

    matches = re.compile(patronvideos,re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)

    itemlist = []
    for match in matches:
        scrapedtitle = match[1] + " " + match[3]
        scrapedplot = ""
        scrapedurl = urlparse.urljoin(item.url,match[0])
        scrapedthumbnail = urlparse.urljoin(item.url,match[2])
        if (DEBUG): logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]")

        # Add to the XBMC listing
        itemlist.append( Item(channel=CHANNELNAME, action="findvideos", title=scrapedtitle , url=scrapedurl , thumbnail=scrapedthumbnail , plot=scrapedplot , folder=True) )

    # Extract the pagination link
    patronvideos  = '<div class="paginacion-num"><a href="([^"]+)">'
    matches = re.compile(patronvideos,re.DOTALL).findall(data)
    scrapertools.printMatches(matches)

    if len(matches)>0:
        scrapedtitle = "Página siguiente"
        scrapedurl = urlparse.urljoin(item.url,matches[0])
        itemlist.append( Item(channel=CHANNELNAME, action="novedades", title=scrapedtitle , url=scrapedurl , folder=True) )

    return itemlist
Example #12
def search(item,texto):
    logger.info("[pelisalacarta.seriesblanco search texto="+texto)

    itemlist = []

    item.url = urlparse.urljoin(host,"/search.php?q1=%s" % (texto))
    data = scrapertools.cache_page(item.url)
    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;|<Br>|<BR>|<br>|<br/>|<br />|-\s","",data)
    data = re.sub(r"<!--.*?-->","",data)

    #<div style='float:left;width: 620px;'><div style='float:left;width: 33%;text-align:center;'><a href='/serie/20/against-the-wall.html' '><img class='ict' src='http://4.bp.blogspot.com/-LBERI18Cq-g/UTendDO7iNI/AAAAAAAAPrk/QGqjmfdDreQ/s320/Against_the_Wall_Seriesdanko.jpg' alt='Capitulos de: Against The Wall' height='184' width='120'></a><br><div style='text-align:center;line-height:20px;height:20px;'><a href='/serie/20/against-the-wall.html' style='font-size: 11px;'> Against The Wall</a></div><br><br>

    patron = "<img class='ict' src='([^']+)'.*?<div style='text-align:center;line-height:20px;height:20px;'><a href='([^']+)' style='font-size: 11px;'>([^<]+)</a>"

    matches = re.compile(patron,re.DOTALL).findall(data)

    for scrapedthumbnail, scrapedurl, scrapedtitle in matches:
        
        
        itemlist.append( Item(channel=__channel__, title =scrapedtitle , url=urlparse.urljoin(host,scrapedurl), action="episodios", thumbnail=scrapedthumbnail, fanart ="http://portfolio.vernier.se/files/2014/03/light-grey-wood-photography-hd-wallpaper-1920x1200-46471.jpg", show=scrapedtitle) )

    try:
        return itemlist
    # Catch the exception so the global search is not interrupted if a channel fails
    except:
        import sys
        for line in sys.exc_info():
            logger.error( "%s" % line )
        return []
Example #13
def processJob(jobDetails):
    try:
        job = {}
        url = urljoin(rootUrl, jobDetails.a['href'])
        soup = thisInstitution.getSoup(url)
        subLinks = soup.select('.pinkbox_heading a')
        if subLinks:
            for link in subLinks:
                job['url'] = urljoin(rootUrl, link['href'])
                job['title'] = link.get_text()
                print job['title']
                job["language"] = 'de'                
                jobPage = thisInstitution.getSoup(job['url'])
                content = jobPage.find(id='contentblock')
                job['text'] = unicode(content)
                thisInstitution.addRecord(job) 
        else:
            job['url'] = url
            job['title'] = jobDetails.a.get_text()
            print job['title']
            job["language"] = 'de'                
            content = soup.find(id='contentblock')
            job['text'] = unicode(content)          
            thisInstitution.addRecord(job)  
    except Exception as e:
        print e
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
Example #14
 def get_sources(self, video):
     source_url = self.get_url(video)
     hosters = []
     if source_url and source_url != FORCE_NO_MATCH:
         url = urlparse.urljoin(self.base_url, source_url)
         page_html = self._http_get(url, cache_limit=.5)
         movie_id = dom_parser.parse_dom(page_html, 'div', {'id': 'media-player'}, 'movie-id')
         sources = {}  # initialize before the branch so the loop below never sees an undefined name
         if movie_id:
             server_url = SL_URL % (movie_id[0])
             url = urlparse.urljoin(self.base_url, server_url)
             html = self._http_get(url, cache_limit=.5)
             for match in re.finditer('changeServer\(\s*(\d+)\s*,\s*(\d+)\s*\).*?class="btn-eps[^>]*>([^<]+)', html, re.DOTALL):
                 link_type, link_id, q_str = match.groups()
                 if link_type in ['12', '13', '14']:
                     url = urlparse.urljoin(self.base_url, PLAYLIST_URL1 % (link_id))
                     sources.update(self.__get_link_from_json(url, q_str))
                 else:
                     media_url = self.__get_ep_pl_url(link_type, page_html)
                     if media_url:
                         url = urlparse.urljoin(self.base_url, media_url)
                         xml = self._http_get(url, cache_limit=.5)
                         sources.update(self.__get_links_from_xml(xml, video))
             
         for source in sources:
             if sources[source]['direct']:
                 host = self._get_direct_hostname(source)
             else:
                 host = urlparse.urlparse(source).hostname
             hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': sources[source]['quality'], 'views': None, 'rating': None, 'url': source, 'direct': sources[source]['direct']}
             hosters.append(hoster)
     return hosters
Example #15
 def __init__(self, layer, mapfile, fonts=None):
     """ Initialize Mapnik provider with layer and mapfile.
         
         XML mapfile keyword arg comes from TileStache config,
         and is an absolute path by the time it gets here.
     """
     maphref = urljoin(layer.config.dirpath, mapfile)
     scheme, h, path, q, p, f = urlparse(maphref)
     
     if scheme in ('file', ''):
         self.mapfile = path
     else:
         self.mapfile = maphref
     
     self.layer = layer
     self.mapnik = None
     
     engine = mapnik.FontEngine.instance()
     
     if fonts:
         fontshref = urljoin(layer.config.dirpath, fonts)
         scheme, h, path, q, p, f = urlparse(fontshref)
         
         if scheme not in ('file', ''):
             raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)
     
         for font in glob(path.rstrip('/') + '/*.ttf'):
             engine.register_font(str(font))
Example #16
def get_favicon_url(url):
    if not url.startswith('http'):
        url = "http://{0}".format(url)

    # Check if the root location has a favicon before parsing for it
    if _has_root_favicon(url):
        return urlparse.urljoin(url, 'favicon.ico')

    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib2.Request(url, None, headers)

    website = urllib2.urlopen(request).read()

    soup = BeautifulSoup(website)
    favicon_element = soup.find("link", rel="shortcut icon")

    if favicon_element:
        hostname = urlparse.urlparse(url).hostname
        favicon_url = favicon_element['href']

        if favicon_url.startswith('//cdn'):
            return "http:" + favicon_url
        # favicon url is relative and must be converted to absolute path
        elif hostname not in favicon_url:
            return urlparse.urljoin(url, favicon_url)
        else:
            return favicon_url
    else:
        return None
Example #17
    def as_obi_serialization(self, request=None):
        """Produce an Open Badge Infrastructure serialization of this badge"""
        if request:
            base_url = request.build_absolute_uri('/')
        else:
            base_url = 'http://%s' % (Site.objects.get_current().domain,)

        # see: https://github.com/brianlovesdata/openbadges/wiki/Assertions
        if not self.creator:
            issuer = SITE_ISSUER
        else:
            issuer = {
                # TODO: Get from user profile instead?
                "origin": urljoin(base_url, self.creator.get_absolute_url()),
                "name": self.creator.username,
                "contact": self.creator.email
            }

        data = {
            # The version of the spec/hub this manifest is compatible with. Use
            # "0.5.0" for the beta.
            "version": OBI_VERSION,
            # TODO: truncate more intelligently
            "name": self.title[:128],
            # TODO: truncate more intelligently
            "description": self.description[:128],
            "criteria": urljoin(base_url, self.get_absolute_url()),
            "issuer": issuer
        }
        if self.image:
            data['image'] = urljoin(base_url, self.image.url)
        return data
Example #18
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            
            fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*movie_langs_list[^"]*'})
            if fragment:
                for match in re.finditer('href="([^"]+)', fragment[0]):
                    match = re.search('movie-player/(.*)', match.group(1))
                    if match:
                        player_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1)))
                        html = self._http_get(player_url, cache_limit=.5)
                        match = re.search('<source\s+src="([^"]+)', html)
                        if match:
                            stream_url = match.group(1)
                            hoster = {'multi-part': False, 'url': stream_url, 'class': self, 'quality': self._gv_get_quality(stream_url), 'host': self._get_direct_hostname(stream_url), 'rating': None, 'views': None, 'direct': True}
                            hosters.append(hoster)
                        
                        fragment2 = dom_parser.parse_dom(html, 'ul', {'class': 'servers'})
                        if fragment2:
                            for match in re.finditer('href="([^"]+).*?<span>(.*?)</span>', fragment2[0]):
                                other_url, quality = match.groups()
                                match = re.search('movie-player/(.*)', other_url)
                                if match:
                                    other_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1)))
                                    if other_url == player_url: continue
                                    hoster = {'multi-part': False, 'url': other_url, 'class': self, 'quality': QUALITY_MAP.get(quality, QUALITIES.HD720), 'host': self._get_direct_hostname(other_url), 'rating': None, 'views': None, 'direct': True}
                                    hosters.append(hoster)

        return hosters
Example #19
 def search(self, video_type, title, year):
     results = []
     norm_title = self._normalize_title(title)
     if video_type == VIDEO_TYPES.TVSHOW:
         for server_url in TVSHOW_URLS:
             for row in self.__parse_directory(self._http_get(server_url, cache_limit=48)):
                 match_year = ''
                 if norm_title in self._normalize_title(row['title']) and (not year or not match_year or year == match_year):
                     result = {'url': urlparse.urljoin(server_url, row['link']), 'title': row['title'], 'year': match_year}
                     results.append(result)
     else:
         search_url = urlparse.urljoin(self.base_url, '/?s=')
         search_url += urllib.quote_plus(title)
         html = self._http_get(search_url, cache_limit=1)
         for article in dom_parser.parse_dom(html, 'article', {'class': 'entry-body'}):
             link = dom_parser.parse_dom(article, 'a', {'class': 'more-link'}, 'href')
             content = dom_parser.parse_dom(article, 'div', {'class': 'post-content'})
             match = re.search('</a>\s*([^<]+)', content[0]) if content else ''
             info = dom_parser.parse_dom(article, 'div', {'class': 'post-info'})
             is_movie = re.search('/category/movies/', info[0]) if info else False
             if match and link and is_movie:
                 match_title_year = match.group(1)
                 match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                 if match:
                     match_title, match_year = match.groups()
                 else:
                     match_title = match_title_year
                     match_year = ''
                 
                 if not year or not match_year or year == match_year:
                     result = {'url': self._pathify_url(link[0]), 'title': match_title, 'year': match_year}
                     results.append(result)
     
     return results
Example #20
def findVideoFrameLink(page, data):

    minheight = 300
    minwidth = 300

    frames = findFrames(data)
    if not frames:
        return None

    iframes = regexUtils.findall(
        data,
        "(frame(?![^>]*cbox\.ws)(?![^>]*Publi)(?![^>]*chat\d*\.\w+)(?![^>]*ad122m)(?![^>]*adshell)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*\sheight\s*=\s*[\"']*([\%\d]+)(?:px)?[\"']*[^>]*>)",
    )

    if iframes:
        for iframe in iframes:
            if iframe[1] == "100%":
                height = minheight + 1
            else:
                height = int(iframe[1])
            if height > minheight:
                m = regexUtils.findall(iframe[0], "[\"' ]width\s*=\s*[\"']*(\d+[%]*)(?:px)?[\"']*")
                if m:
                    if m[0] == "100%":
                        width = minwidth + 1
                    else:
                        width = int(m[0])
                    if width > minwidth:
                        m = regexUtils.findall(iframe[0], "['\"\s]src=[\"']*\s*([^>\"' ]+)\s*[>\"']*")
                        if m:
                            return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    # Alternative 1
    iframes = regexUtils.findall(
        data, '(frame(?![^>]*cbox\.ws)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*["; ]height:\s*(\d+)[^>]*>)'
    )
    if iframes:
        for iframe in iframes:
            height = int(iframe[1])
            if height > minheight:
                m = regexUtils.findall(iframe[0], '["; ]width:\s*(\d+)')
                if m:
                    width = int(m[0])
                    if width > minwidth:
                        m = regexUtils.findall(iframe[0], '["; ]src=["\']*\s*([^>"\' ]+)\s*[>"\']*')
                        if m:
                            return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    # Alternative 2 (Frameset)
    m = regexUtils.findall(data, '<FRAMESET[^>]+100%[^>]+>\s*<FRAME[^>]+src="([^"]+)"')
    if m:
        return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    m = regexUtils.findall(
        data, '<a href="([^"]+)" target="_blank"><img src="[^"]+" height="450" width="600" longdesc="[^"]+"/></a>'
    )
    if m:
        return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    return None
Example #21
 def parse_list(self, response):
     hxs = HtmlXPathSelector(response)
     for href in hxs.select(r'//ul[@id="paper-listing"]//a/@href').extract():
         yield Request(urlparse.urljoin(response.url, href), callback=self.parse_paper)
     next = hxs.select(r'//div[@class="pagination"]/ul/li[@class="next"]/a/@href')
     if len(next):
         yield Request(urlparse.urljoin(response.url, next[0].extract()), callback=self.parse_list)
Example #22
    def get_sources(self, url, hosthdDict, hostDict, locDict):
        try:
            sources = []

            if url == None: return sources

            url = urlparse.urljoin(self.base_link, url)

            result = client.source(url)

            video_id = re.compile('video_id *= *[\'|\"](.+?)[\'|\"]').findall(result)[0]
            post = urllib.urlencode({'video_id': video_id})

            result = client.source(urlparse.urljoin(self.base_link, self.info_link), post=post)

            u = [i for i in result.split('&') if 'google' in i][0]
            u = urllib.unquote_plus(u)
            u = [urllib.unquote_plus(i.split('|')[-1]) for i in u.split(',')]
            u = [googleplus.tag(i)[0] for i in u]
            u = [i for i in u if i['quality'] in ['1080p', 'HD']]

            for i in u: sources.append({'source': 'GVideo', 'quality': i['quality'], 'provider': 'Afdah', 'url': i['url']})

            return sources
        except:
            return sources
Example #23
 def parse(self, response):
     self._logger.info("start response in parse -> response type:%s"%type(response).__name__)
     item_urls = [
         urljoin(response.url, x) for x in list(set(
             response.xpath('//div[@id="resultsCol"]//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract()
         ))
     ]
     self.crawler.stats.inc_total_pages(response.meta['crawlid'], response.meta['spiderid'], response.meta['appid'], len(item_urls))
     for item_url in item_urls:
         yield Request(url=item_url,
                       callback=self.parse_item,
                       meta=response.meta)
     workers = response.meta.get('workers', {})
     for worker in workers.keys():
         workers[worker] = 0
     if "if_next_page" in response.meta: del response.meta["if_next_page"]
     next_page_urls = [
         urljoin(response.url, x) for x in list(set(
             response.xpath('//div[@id="pagn"]//span[@class="pagnRA"]/a/@href').extract()
         ))
     ]
     response.meta["if_next_page"] = True
     for next_page_url in next_page_urls:
         yield Request(url=next_page_url,
                       callback=self.parse,
                       meta=response.meta)
Example #24
def mainlist(item):
    logger.info()

    thumb_series    = get_thumb("squares", "thumb_canales_series.png")
    thumb_series_az = get_thumb("squares", "thumb_canales_series_az.png")
    thumb_buscar    = get_thumb("squares", "thumb_buscar.png")

    itemlist = []
    itemlist.append(Item(channel=item.channel, title="Listado alfabético", action="series_listado_alfabetico",
                         thumbnail=thumb_series_az))
    itemlist.append(Item(channel=item.channel, title="Todas las series", action="series",
                         url=urlparse.urljoin(HOST, "listado/"), thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Capítulos de estreno", action="homeSection", extra=CAPITULOS_DE_ESTRENO_STR,
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Último actualizado", action="homeSection", extra="Último Actualizado",
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series más vistas", action="homeSection", extra="Series Más vistas",
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series menos vistas", action="homeSection", extra="Series Menos vistas",
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Últimas fichas creadas", action="series",
                         url=urlparse.urljoin(HOST, "fichas_creadas/"), thumbnail=thumb_series))

    itemlist.append(Item(channel=item.channel, title="Buscar...", action="search", url=HOST, thumbnail=thumb_buscar))

    if filtertools.context:
        itemlist = filtertools.show_option(itemlist, item.channel, list_idiomas, CALIDADES)

    return itemlist
Example #25
	def parseImgLinks(self,depth=1):
		url_response = None
		try:
			url_response = urllib2.urlopen(self.scrap_url,timeout=self._timeout)
		except Exception as e:
			print("   [ERROR]: Could not open {0}: {1}".format(self.scrap_url,e.reason))
			return self.img_list

		html_parse = BeautifulSoup(url_response)
		unique_images_found = 0
		total_images_found = 0
		self.visited[self.scrap_url] = 1

		for img in html_parse.findAll('img'):
			try:
				abs_url = urljoin(self.scrap_url,img['src']) if urlparse(img['src']).netloc == "" else img['src']
				if abs_url not in self.img_list:
					self.img_list.add(abs_url)
					unique_images_found += 1
				total_images_found += 1
			except:
				pass

		print("   [Found %d images / %d new]: %s" % (total_images_found,unique_images_found,self.scrap_url))
		if depth > 1:
			for a in html_parse.findAll('a'):
				try:
					if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
						self.scrap_url = urljoin(self.scrape_url_orig,a['href'])
						if self.scrap_url in self.visited: continue
						self.parseImgLinks(depth - 1)
				except:
					pass
		return self.img_list
Example #26
def choose_reference(experiment, biorep_n, server, keypair, sex_specific):

    replicates = [common.encoded_get(urlparse.urljoin(server,rep_uri), keypair, frame='embedded') for rep_uri in experiment['replicates']]
    replicate = next(rep for rep in replicates if rep.get('biological_replicate_number') == biorep_n)
    logging.debug('Replicate uuid %s' %(replicate.get('uuid')))
    organism_uri = replicate.get('library').get('biosample').get('organism')
    organism_obj = common.encoded_get(urlparse.urljoin(server,organism_uri), keypair)

    try:
        organism_name = organism_obj['name']
    except:
        logging.error('%s:rep%d Cannot determine organism.' %(experiment.get('accession'), biorep_n))
        raise
    else:
        logging.debug("Organism name %s" %(organism_name))

    if sex_specific:
        try:
            sex = replicate.get('library').get('biosample').get('sex')
            assert sex in ['male', 'female']
        except:
            logging.warning('%s:rep%d Sex is %s.  Mapping to male reference.' %(experiment.get('accession'), biorep_n, sex))
            sex = 'male'

        logging.debug('Organism %s sex %s' %(organism_name, sex))
    else:
        sex = 'male'
    
    genome_assembly = args.assembly

    reference = next((ref.get('file') for ref in REFERENCES if ref.get('organism') == organism_name and ref.get('sex') == sex and ref.get('assembly') == genome_assembly), None)
    logging.debug('Found reference %s' %(reference))
    return reference
Example #27
def episodios(item):
    logger.info("{0} - {1}".format(item.title, item.url))

    itemlist = []

    # Download the page
    data = scrapertools.cache_page(item.url)

    fanart = scrapertools.find_single_match(data, "background-image[^'\"]+['\"]([^'\"]+)")
    plot = scrapertools.find_single_match(data, "id=['\"]profile2['\"]>\s*(.*?)\s*</div>")

    logger.debug("fanart: {0}".format(fanart))
    logger.debug("plot: {0}".format(plot))


    episodes = re.findall("<tr.*?href=['\"](?P<url>[^'\"]+).+?>(?P<title>.+?)</a>.*?<td>(?P<flags>.*?)</td>", data, re.MULTILINE | re.DOTALL)
    for url, title, flags in episodes:
        idiomas = " ".join(["[{0}]".format(IDIOMAS.get(language, "OVOS")) for language in re.findall("banderas/([^\.]+)", flags, re.MULTILINE)])
        displayTitle = "{show} - {title} {languages}".format(show = item.show, title = title, languages = idiomas)
        logger.debug("Episode found {0}: {1}".format(displayTitle, urlparse.urljoin(HOST, url)))
        itemlist.append(item.clone(title=displayTitle, url=urlparse.urljoin(HOST, url),
                                   action="findvideos", plot=plot, fanart=fanart, language=idiomas,
                                   list_idiomas=list_idiomas, list_calidad=CALIDADES, context=filtertools.context))

    if len(itemlist) > 0 and filtertools.context:
        itemlist = filtertools.get_links(itemlist, item.channel)

    if config.get_library_support() and len(itemlist) > 0:
        itemlist.append(item.clone(title="Añadir esta serie a la biblioteca", action="add_serie_to_library", extra="episodios"))

    return itemlist
Example #28
    def __search(self, titles, type, year, season=0, episode=False):
        try:
            years = [str(year), str(int(year) + 1), str(int(year) - 1)]
            years = ['&veroeffentlichung[]=%s' % i for i in years]

            query = self.search_link % (type, urllib.quote_plus(cleantitle.query(titles[0])))
            query += ''.join(years)
            query = urlparse.urljoin(self.base_link, query)

            t = [cleantitle.get(i) for i in set(titles) if i]

            r = self.__proceed_search(query)
            r = [i[0] for i in r if cleantitle.get(i[1]) in t and int(i[2]) == int(season)][0]

            url = source_utils.strip_domain(r)
            if episode:
                r = client.request(urlparse.urljoin(self.base_link, url))
                r = dom_parser.parse_dom(r, 'div', attrs={'class': 'season-list'})
                r = dom_parser.parse_dom(r, 'li')
                r = dom_parser.parse_dom(r, 'a', req='href')
                r = [i.attrs['href'] for i in r if i and int(i.content) == int(episode)][0]

                url = source_utils.strip_domain(r)
            return url
        except:
            return
Example #29
    def check_page(self, page):

        self.marionette.navigate(urlparse.urljoin(self.server_prefix, page))
        try:
            self.marionette.find_element("id", 'complete')
        except NoSuchElementException:
            fullPageUrl = urlparse.urljoin(self.relPath, page)

            details = "%s: 1 failure encountered\n%s" % \
                      (fullPageUrl,
                       self.get_failure_summary(
                           fullPageUrl, "Waiting for Completion",
                           "Could not find the test complete indicator"))

            raise AssertionError(details)

        fail_node = self.marionette.find_element("css selector",
                                                 '.failures > em')
        if fail_node.text == "0":
            return

        # This may want to be in a more general place triggerable by an env
        # var some day if it ends up being something we need often:
        #
        # If you have browser-based unit tests which work when loaded manually
        # but not from marionette, uncomment the two lines below to break
        # on failing tests, so that the browsers won't be torn down, and you
        # can use the browser debugging facilities to see what's going on.
        #from ipdb import set_trace
        #set_trace()

        raise AssertionError(self.get_failure_details(page))
Example #30
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            page_url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(page_url, cache_limit=.25)
            match = re.search('''<option[^>]+value\s*=\s*["']([^"']+)[^>]*>(?:Altyaz.{1,3}s.{1,3}z)<''', html)
            if match:
                option_url = urlparse.urljoin(self.base_url, match.group(1))
                html = self._http_get(option_url, cache_limit=.25)
                fragment = dom_parser.parse_dom(html, 'span', {'class': 'object-wrapper'})
                if fragment:
                    iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src')
                    if iframe_url:
                        html = self._http_get(iframe_url[0], cache_limit=.25)

                        seen_urls = {}
                        for match in re.finditer('"?file"?\s*:\s*"([^"]+)"\s*,\s*"?label"?\s*:\s*"(\d+)p?[^"]*"', html):
                            stream_url, height = match.groups()
                            if stream_url not in seen_urls:
                                seen_urls[stream_url] = True
                                stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())
                                host = self._get_direct_hostname(stream_url)
                                if host == 'gvideo':
                                    quality = scraper_utils.gv_get_quality(stream_url)
                                else:
                                    quality = scraper_utils.height_get_quality(height)
                                hoster = {'multi-part': False, 'host': self._get_direct_hostname(stream_url), 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                                hosters.append(hoster)
    
        return hosters
Example #31
 def _getFullUrl(url):
     if self.cm.isValidUrl(url): return url
     else: return urljoin(baseUrl, url)
Example #32
    def generate_context(self):
        """change the context"""

        # return the list of files to use
        files = self.get_files(self.path, exclude=[
            'pages',
        ])
        all_articles = []
        for f in files:
            content, metadata = read_file(f)

            # if no category is set, use the name of the path as a category
            if 'category' not in metadata.keys():

                if os.path.dirname(f) == self.path:
                    category = self.settings['DEFAULT_CATEGORY']
                else:
                    category = os.path.basename(os.path.dirname(f))

                if category != '':
                    metadata['category'] = unicode(category)

            if 'date' not in metadata.keys()\
                and self.settings['FALLBACK_ON_FS_DATE']:
                metadata['date'] = datetime.fromtimestamp(os.stat(f).st_ctime)

            article = Article(content,
                              metadata,
                              settings=self.settings,
                              filename=f)
            if not is_valid_content(article, f):
                continue

            add_to_url = u''
            if 'ARTICLE_PERMALINK_STRUCTURE' in self.settings:
                article_permalink_structure = self.settings[
                    'ARTICLE_PERMALINK_STRUCTURE']
                article_permalink_structure = article_permalink_structure.lstrip(
                    '/')

                # try to substitute any python datetime directive
                add_to_url = article.date.strftime(article_permalink_structure)
                # try to substitute any article metadata in rest file
                add_to_url = add_to_url % article.__dict__
                add_to_url = [slugify(i) for i in add_to_url.split('/')]
                add_to_url = os.path.join(*add_to_url)

            article.url = urlparse.urljoin(add_to_url, article.url)
            article.save_as = urlparse.urljoin(add_to_url, article.save_as)

            if article.status == "published":
                if hasattr(article, 'tags'):
                    for tag in article.tags:
                        self.tags[tag].append(article)
                all_articles.append(article)
            elif article.status == "draft":
                self.drafts.append(article)

        self.articles, self.translations = process_translations(all_articles)

        for article in self.articles:
            # only main articles are listed in categories, not translations
            self.categories[article.category].append(article)
            self.authors[article.author].append(article)

        # sort the articles by date
        self.articles.sort(key=attrgetter('date'), reverse=True)
        self.dates = list(self.articles)
        self.dates.sort(key=attrgetter('date'),
                        reverse=self.context['REVERSE_ARCHIVE_ORDER'])

        # create tag cloud
        tag_cloud = defaultdict(int)
        for article in self.articles:
            for tag in getattr(article, 'tags', []):
                tag_cloud[tag] += 1

        tag_cloud = sorted(tag_cloud.items(), key=itemgetter(1), reverse=True)
        tag_cloud = tag_cloud[:self.settings.get('TAG_CLOUD_MAX_ITEMS')]

        tags = map(itemgetter(1), tag_cloud)
        if tags:
            max_count = max(tags)
        steps = self.settings.get('TAG_CLOUD_STEPS')

        # calculate word sizes
        self.tag_cloud = [
            (tag,
             int(
                 math.floor(steps - (steps - 1) * math.log(count) /
                            (math.log(max_count) or 1))))
            for tag, count in tag_cloud
        ]
        # put words in chaos
        random.shuffle(self.tag_cloud)

        # and generate the output :)

        # order the categories per name
        self.categories = list(self.categories.items())
        self.categories.sort(
            reverse=self.settings.get('REVERSE_CATEGORY_ORDER'))

        self.authors = list(self.authors.items())
        self.authors.sort()

        self._update_context(('articles', 'dates', 'tags', 'categories',
                              'tag_cloud', 'authors'))
Example #33
 def get_url(self, cmd, **args):
     cmd_path = cmd if not args else cmd + '?{0}'.format(urlencode(args))
     return self.location(urljoin(Importer.api_base, cmd_path))[0]
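
A minimal sketch (Python 2, with a hypothetical api_base and command that are not taken from the example) of how the query string built with urlencode ends up in the joined URL:

from urllib import urlencode
from urlparse import urljoin

api_base = 'https://api.example.com/v1/'   # hypothetical base, stands in for Importer.api_base
cmd_path = 'imports' + '?{0}'.format(urlencode({'page': 2, 'limit': 50}))
print urljoin(api_base, cmd_path)          # e.g. https://api.example.com/v1/imports?page=2&limit=50
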
Example #34
def redirect(url):
    newloc = urlparse.urljoin(context.home + context.path, url)
    context.status = '301 Moved Permanently'
    header('Content-Type', 'text/html')
    header('Location', newloc)
Example #35
 def get(self, cdnRelativePath, relative=True):
     url = urlparse.urljoin(self._cdnRootUrl, cdnRelativePath)
     return super(CdnResourcesCache, self).get(url, relative=relative)
Example #36
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            if debrid.status() == False: raise Exception()

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']

            hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

            query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
            query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

            url = self.search_link % urllib.quote_plus(query)
            url = urlparse.urljoin(self.base_link, url)

            scraper = cfscrape.create_scraper()
            r = scraper.get(url).content
            posts = client.parseDOM(r, 'item')

            hostDict = hostprDict + hostDict
            print posts
            items = []
            
            for post in posts:
                try:
                    print post
                    items += zip(client.parseDOM(post, 'title'), client.parseDOM(post, 'link'))
                except:
                    pass
           
            items = [(i[0], i[1]) for i in items if data['year'] in i[0]]        
            print items[:1]
            for item in items:
                try:
                    name = item[0]
                    name = client.replaceHTMLCodes(name)

                    t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name)

                    if not cleantitle.get(t) == cleantitle.get(title): raise Exception()

                    y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper()

                    if not y == hdlr: raise Exception()

                    fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                    fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                    fmt = [i.lower() for i in fmt]

                    if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception()
                    if any(i in ['extras'] for i in fmt): raise Exception()

                    if '1080p' in fmt: quality = '1080p'
                    elif '720p' in fmt: quality = 'HD'
                    else: quality = 'SD'
                    if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR'
                    elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM'

                    info = []

                    if '3d' in fmt: info.append('3D')

                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', name)[-1]
                        div = 1 if size.endswith(' GB') else 1024
                        size = float(re.sub('[^0-9|/.|/,]', '', size))/div
                        size = '%.2f GB' % size
                        info.append(size)
                    except:
                        pass

                    if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC')

                    info = ' | '.join(info)

                    url = item[1]
                    if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                    if not host in hostDict: raise Exception()
                    host = client.replaceHTMLCodes(host)
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
                except:
                    pass

            return sources
        except:
            return sources
Example #37
def main():
    
    print 'sending requests to %s' % BASE_URL 

    # DELETE map

    map_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes')
    headers = {'Authorization': 'Bearer %s' % TOKEN1}
    r = requests.delete(map_url, headers=headers)
    if r.status_code == 200:
        print 'deleted test map %s etag: %s' % (r.headers['Content-Location'], r.headers['etag'])
    elif r.status_code == 404:
        print 'test map was not there %s' % (map_url)
    else:
        print 'failed to delete map %s %s %s' % (map_url, r.status_code, r.text)
        return

    # Make sure the permissions exist for the test Org

    org_url = '/v1/o/ayesha'

    permissions = {
        '_subject': org_url,
        '_permissions': {
          'read': [USER1],
          'update': [USER1],
          'delete': [USER1]
        },
        '_self': {
          'read': [USER1],
          'delete': [USER1],
          'update': [USER1],
          'create': [USER1]
        },
        'maps': {
          'read': [USER1],
          'delete': [USER1],
          'create': [USER1]
        }
      }

    permissons_url = urljoin(BASE_URL, '/permissions')
    headers = {'Authorization': 'Bearer %s' % TOKEN1, 'Content-Type': 'application/json'}
    r = requests.post(permissons_url, headers=headers, json=permissions)
    if r.status_code == 201:
        print 'correctly created permissions for org %s etag: %s' % (r.headers['Location'], r.headers['etag'])
    elif r.status_code == 409:
        print 'correctly saw that permissions for org %s already exist' % (org_url)    
    else:
        print 'failed to create permissions %s %s %s' % (permissons_url, r.status_code, r.text)
        return

    # Create map using POST

    map = {
        'isA': 'Map',
        'org': '/v1/o/ayesha',
        'name': 'nursery-rhymes',
        'test-data': True
        }

    maps_url = urljoin(BASE_URL, '/maps') 
    
    headers = {'Content-Type': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.post(maps_url, headers=headers, json=map)
    if r.status_code == 201:
        print 'correctly created map %s etag: %s' % (r.headers['Location'], r.headers['etag'])
        map_url = urljoin(BASE_URL, r.headers['Location'])
        print 'text:', type(r.text)
        map_entries = urljoin(BASE_URL, r.json()['entries'])
    else:
        print 'failed to create map %s %s %s' % (maps_url, r.status_code, r.text)
        return
        
    # GET Map

    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(map_url, headers=headers, json=map)
    if r.status_code == 200:
        map_url2 = urljoin(BASE_URL, r.headers['Content-Location'])
        if map_url == map_url2:
            map = r.json()
            print 'correctly retrieved map: %s etag: %s' % (map_url, r.headers['etag'])
        else:
            print 'retrieved map at %s but Content-Location is wrong: %s' % (map_url, map_url2)
            return
    else:
        print 'failed to retrieve map %s %s %s' % (map_url, r.status_code, r.text)
        return
        
    # POST entry for Humpty Dumpty

    entry = {
        'isA': 'MapEntry',
        'key': 'HumptyDumpty',
        'test-data': True
        }

    entries_url = urljoin(BASE_URL, map['entries'])   
    headers = {'Content-Type': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.post(entries_url, headers=headers, json=entry)
    if r.status_code == 201:
        entry_url = urljoin(BASE_URL, r.headers['Location'])
        value_ref  = urljoin(BASE_URL, r.json()['value'])
        print 'correctly created entry: %s value: %s map: %s etag: %s' % (entry_url, value_ref, urljoin(BASE_URL, r.json()['map']), r.headers['etag'])
    else:
        print 'failed to create entry %s %s %s' % (entries_url, r.status_code, r.text)
        return

    # GET entry for Humpty Dumpty

    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(entry_url, headers=headers)
    if r.status_code == 200:
        value_ref  = urljoin(BASE_URL, r.json()['value'])
        print 'correctly retrieved entry: %s value: %s map: %s etag: %s' % (entry_url, value_ref, urljoin(BASE_URL, r.json()['map']), r.headers['etag'])
    else:
        print 'failed to retrieve entry %s %s %s' % (entry_url, r.status_code, r.text)
        return

    # PUT value for HumptyDumpty

    headers = {'Content-Type': 'text/plain','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.put(value_ref, headers=headers, data='Humpty Dumpty Sat on a wall')
    if r.status_code == 200:
        loc = r.headers['Content-Location']
        print 'correctly created value: %s etag: %s' % (loc, r.headers['etag'])
        value_url = urljoin(BASE_URL, r.headers['Content-Location'])
    else:
        print 'failed to create value %s %s %s' % (value_ref, r.status_code, r.text)
        return
        
    # PUT value for LittleMissMuffet

    headers = {'Content-Type': 'text/plain','Authorization': 'Bearer %s' % TOKEN1}
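    # The ';' matrix-parameter form below appears to address an entry directly by
    # key (<map_url>/entries;LittleMissMuffet/value), so a value can be written
    # without first POSTing a MapEntry as was done for HumptyDumpty above.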
    value_ref2 = '%s/entries;%s/value' % (map_url, 'LittleMissMuffet')
    r = requests.put(value_ref2, headers=headers, data='Little Miss Muffet\nSat on a tuffet')
    if r.status_code == 200:
        loc = r.headers['Content-Location']
        print 'correctly created value: %s etag: %s' % (loc, r.headers['etag'])
        value_url = urljoin(BASE_URL, loc)
    else:
        print 'failed to create value %s %s %s' % (value_ref2, r.status_code, r.text)
        return

    # GET entry for LittleMissMuffet

    entry_ref2 = '%s/entries;%s' % (map_url, 'LittleMissMuffet')
    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(entry_ref2, headers=headers)
    if r.status_code == 200:
        value_ref  = urljoin(BASE_URL, r.json()['value'])
        assert(value_ref == value_url)
        print 'correctly retrieved entry: %s value: %s map: %s etag: %s' % (entry_ref2, value_ref, urljoin(BASE_URL, r.json()['map']), r.headers['etag'])
    else:
        print 'failed to retrieve entry %s %s %s' % (entry_ref2, r.status_code, r.text)
        return

    # GET value for LittleMissMuffet

    headers = {'Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(value_ref2, headers=headers)
    if r.status_code == 200:
        loc = r.headers['Content-Location']
        print 'correctly got value at %s length: %s etag: %s text: %s' % (loc, len(r.text), r.headers['etag'], r.text)
    else:
        print 'failed to get value %s %s %s' % (value_ref2, r.status_code, r.text)
        return

    # GET all entries for map

    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(map_entries, headers=headers)
    if r.status_code == 200:
        print 'correctly retrieved map entries: %s' % map_entries
    else:
        print 'failed to retrieve map entries %s %s %s' % (map_entries, r.status_code, r.text)
        return

    # GET map by name

    name_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes')
    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(name_url, headers=headers)
    if r.status_code == 200:
        print 'correctly retrieved map by name: %s etag: %s' % (name_url, r.headers['etag']) 
    else:
        print 'failed to retrieve map by name %s %s %s' % (name_url, r.status_code, r.text)
        return

    map = {
        'isA': 'Map',
        'name': 'nursery-rhymes',
        'org': '/v1/o/ayesha',
        'test-data': True
        }

    # Create map with duplicate name

    headers = {'Content-Type': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.post(maps_url, headers=headers, json=map)
    if r.status_code == 409:
        print 'correctly refused to create map with duplicate name %s' % (r.text)
    else:
        print 'failed to reject map with duplicate name %s %s %s' % (maps_url, r.status_code, r.text)
        return

    # GET entries by map name

    entries_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes/entries')
    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(entries_url, headers=headers)
    if r.status_code == 200:
        entries = r.json()
        if 'contents' in entries and isinstance(entries['contents'], list):
            print 'correctly retrieved map entries by name: %s' % (r.headers['Content-Location'])
        else:
            print 'wrong return type for map entries by name: %s type: %s' % (r.headers['Content-Location'], type(entries.get('contents')))
            return
    else:
        print 'failed to retrieve map entries by name %s %s %s' % (entries_url, r.status_code, r.text)
        return

    # GET entry by map name and key

    entry_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes/entries;HumptyDumpty')
    headers = {'Accept': 'application/json','Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(entry_url, headers=headers)
    if r.status_code == 200:
        print 'correctly retrieved map entry by name from map by name: %s returned: %s' % (entry_url, r.headers['Content-Location'])
    else:
        print 'failed to retrieve map entry by name from map by name %s %s %s' % (entry_url, r.status_code, r.text)
        return

    # GET value by map name and key

    value_url = urljoin(BASE_URL, '/maps;ayesha:nursery-rhymes/entries;HumptyDumpty/value')
    headers = {'Authorization': 'Bearer %s' % TOKEN1}
    r = requests.get(value_url, headers=headers)
    if r.status_code == 200:
        print 'correctly retrieved value from map entry by name: %s at: %s text: %s' % (value_url, r.headers['Content-Location'], r.text)
    else:
        print 'failed to retrieve value from map entry by name %s %s %s' % (value_url, r.status_code, r.text)
        return

    # DELETE map

    headers = {'Authorization': 'Bearer %s' % TOKEN1}
    r = requests.delete(map_url, headers=headers)
    if r.status_code == 200:
        print 'correctly deleted map %s etag: %s' % (r.headers['Content-Location'], r.headers['etag'])
    else:
        print 'failed to delete map %s %s %s' % (map_url, r.status_code, r.text)
        return
 def getTokenEndpoint(self):
     if not self.url:
         raise Exception("oauth url error", self.url)
     
     return urljoin(self.url, self.OAUTH2_PATH['token'])
    def sources(self, url, hostDict, hostprDict):
        sources = []
        try:
            if url == None:
                raise Exception()

            if not self.api:
                raise Exception()

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            year = int(data['year']) if 'year' in data and not data['year'] == None else None
            season = int(data['season']) if 'season' in data and not data['season'] == None else None
            episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
            query = '%s S%02dE%02d' % (title, season, episode) if 'tvshowtitle' in data else '%s %d' % (title, year)

            query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

            query += ' lang:%s' % self.language[0]
            query = urllib.quote_plus(query)
            url = urlparse.urljoin(self.base_link, self.search_link)

            hostDict = hostprDict + hostDict

            iterations = self.streamLimit/self.streamIncrease
            last = self.streamLimit - (iterations * self.streamIncrease)
            if not last:
                iterations = iterations - 1
                last = self.streamIncrease
            iterations = iterations + 1
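            # Worked example with hypothetical limits (not taken from any provider
            # config): streamLimit = 50 and streamIncrease = 20 give 50 / 20 = 2 full
            # batches plus a final batch of 50 - 2 * 20 = 10, i.e. iterations = 3 and
            # last = 10. If the limit divides evenly, 'last' would be 0, so one
            # iteration is dropped and 'last' is reset to a full streamIncrease batch.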

            seen_urls = set()
            for type in self.types:
                searchFrom = 0
                searchCount = self.streamIncrease
                for offset in range(iterations):
                    if iterations == offset + 1: searchCount = last
                    urlNew = url % (type, self.api, query, searchCount, searchFrom)
                    searchFrom = searchFrom + self.streamIncrease

                    results = client.request(urlNew)
                    results = json.loads(results)

                    apistatus  = results['status']
                    if apistatus != 'success': break

                    results = results['result']

                    added = False
                    for result in results:
                        jsonName = result['title']
                        jsonSize = result['sizeinternal']
                        jsonExtension = result['extension']
                        jsonLanguage = result['lang']
                        jsonHoster = result['hostername'].lower()
                        jsonLink = result['hosterurls'][0]['url']

                        if jsonLink in seen_urls: continue
                        seen_urls.add(jsonLink)

                        if not jsonHoster in hostDict: continue

                        if not self.extensionValid(jsonExtension): continue

                        quality, info = source_utils.get_release_quality(jsonName)
                        info.append(self.formatSize(jsonSize))
                        info.append(jsonName)
                        info = '|'.join(info)

                        sources.append({'source' : jsonHoster, 'quality':  quality, 'language' : jsonLanguage, 'url' : jsonLink, 'info': info, 'direct' : False, 'debridonly' : False})
                        added = True

                    if not added:
                        break

            return sources
        except:
            return sources
Beispiel #40
0
 def url(self, filename):
     return urlparse.urljoin(self.base_url, filename).replace('\\', '/')
        scrape_and_look_for_next_link(next_url)
    if 4050 < i < 4500:
        next_url = ListofOKCases[i]
        print next_url
        record = {}
        record['URL'] = next_url
        scraperwiki.sqlite.save(['URL'], record)
        scrape_and_look_for_next_link(next_url)


# ---------------------------------------------------------------------------
# START HERE: define your starting URL - then
# call a function to scrape the first page in the series.
# ---------------------------------------------------------------------------
base_url = 'http://www.oscn.net/dockets/'
starting_url = urlparse.urljoin(
    base_url, 'GetCaseInformation.aspx?db=garfield&number=CF-2011-1')
print starting_url
global i
i = 1
#for i in range(0,1):
#There are 743 cases but 468 appears to be the server request limit
CaseEndingNumbers()
ListOfCaseEndingNumbers = list(CaseEndingNumbers())
GetOklahomaStateCases()
ListofOKCases = list(GetOklahomaStateCases())
scrape_and_look_for_next_link(starting_url)

# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
 def getAuthorizeEndpoint(self):
     if not self.url:
         raise Exception("oauth url error", self.url)
     
     return urljoin(self.url, self.OAUTH2_PATH['authorize'])
Beispiel #43
0
 def relative_path_to_absolute_uri(self, relative_path):
     """Return an aboslute URI given a relative path taking into account the test context."""
     return urlparse.urljoin(BASE_URL, relative_path)
Beispiel #44
0
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            if debrid.status() == False: raise Exception()

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']

            hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

            query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
            query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
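            # e.g. a hypothetical episode request with season 1, episode 2 yields
            # hdlr 'S01E02' and query '<tvshowtitle> S01E02'; for movies the query is
            # '<title> <year>', with punctuation collapsed to spaces by the re.sub above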

            s = client.request(self.base_link)
            s = re.findall('\'(http.+?)\'', s) + re.findall('\"(http.+?)\"', s)
            s = [i for i in s if urlparse.urlparse(self.base_link).netloc in i and len(i.strip('/').split('/')) > 3]
            s = s[0] if s else urlparse.urljoin(self.base_link, 'posts')
            s = s.strip('/')

            url = s + self.search_link % urllib.quote_plus(query)

            r = client.request(url)

            r = client.parseDOM(r, 'h2', attrs = {'class': 'post-title'})
            r = zip(client.parseDOM(r, 'a', ret='href'), client.parseDOM(r, 'a', ret='title'))
            r = [(i[0], i[1], re.sub('(\.|\(|\[|\s)(\d{4}|3D)(\.|\)|\]|\s|)(.+|)', '', i[1]), re.findall('[\.|\(|\[|\s](\d{4}|)([\.|\)|\]|\s|].+)', i[1])) for i in r]
            r = [(i[0], i[1], i[2], i[3][0][0], i[3][0][1]) for i in r if i[3]]
            r = [(i[0], i[1], i[2], i[3], re.split('\.|\(|\)|\[|\]|\s|\-', i[4])) for i in r]
            r = [i for i in r if cleantitle.get(title) == cleantitle.get(i[2]) and data['year'] == i[3]]
            r = [i for i in r if not any(x in i[4] for x in ['HDCAM', 'CAM', 'DVDR', 'DVDRip', 'DVDSCR', 'HDTS', 'TS', '3D'])]
            r = [i for i in r if '1080p' in i[4]][:1] + [i for i in r if '720p' in i[4]][:1]

            posts = [(i[1], i[0]) for i in r]

            hostDict = hostprDict + hostDict

            items = []

            for post in posts:
                try:
                    t = post[0]

                    u = client.request(post[1])
                    u = re.findall('\'(http.+?)\'', u) + re.findall('\"(http.+?)\"', u)
                    u = [i for i in u if not '/embed/' in i]
                    u = [i for i in u if not 'youtube' in i]

                    items += [(t, i) for i in u]
                except:
                    pass

            for item in items:
                try:
                    name = item[0]
                    name = client.replaceHTMLCodes(name)

                    t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name)

                    if not cleantitle.get(t) == cleantitle.get(title): raise Exception()

                    y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper()

                    if not y == hdlr: raise Exception()

                    fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                    fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                    fmt = [i.lower() for i in fmt]

                    if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception()
                    if any(i in ['extras'] for i in fmt): raise Exception()

                    if '1080p' in fmt: quality = '1080p'
                    elif '720p' in fmt: quality = 'HD'
                    else: quality = 'SD'
                    if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR'
                    elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM'

                    info = []

                    if '3d' in fmt: info.append('3D')

                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', item[2])[-1]
                        div = 1 if size.endswith(('GB', 'GiB')) else 1024
                        size = float(re.sub('[^0-9|/.|/,]', '', size))/div
                        size = '%.2f GB' % size
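                        # e.g. a hypothetical match of '700 MB' gives div = 1024, so
                        # size becomes 700 / 1024 ~ 0.68 and is reported as '0.68 GB'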
                        info.append(size)
                    except:
                        pass

                    if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC')

                    info = ' | '.join(info)

                    url = item[1]
                    if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                    if not host in hostDict: raise Exception()
                    host = client.replaceHTMLCodes(host)
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
                except:
                    pass

            check = [i for i in sources if not i['quality'] == 'CAM']
            if check: sources = check

            return sources
        except:
            return sources
Beispiel #45
0
def videolist(params, url, category):
    logger.info("[veocine.py] mainlist")

    # ------------------------------------------------------
    # Download the page
    # ------------------------------------------------------
    data = scrapertools.cachePage(url)
    #logger.info(data)

    # ------------------------------------------------------
    # Extract the movies
    # ------------------------------------------------------
    patron = '<tr.*?'
    patron += '<td.*?'
    patron += '<a href="([^"]+)">'
    patron += "<img src='([^']+)'.*?<a.*?>\s*(.*?)\s*<(.*?)"
    patron += "<img .*? alt='([^']+)' />"
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG:
        scrapertools.printMatches(matches)

    for match in matches:
        try:
            scrapedtitle = unicode(
                match[2], "utf-8").encode("iso-8859-1") + " (" + match[4] + ")"
        except:
            scrapedtitle = match[2] + " (" + match[4] + ")"
        scrapedurl = urlparse.urljoin("http://www.veocine.es/", match[0])
        scrapedthumbnail = ""

        try:
            scrapedplot = unicode(match[3], "utf-8").encode("iso-8859-1")
        except:
            scrapedplot = match[3]

        scrapedplot = scrapedplot.replace("/a>", "\n")
        scrapedplot = scrapedplot.replace("<br />", "\n")
        scrapedplot = scrapedplot.replace("<b>", "")
        scrapedplot = scrapedplot.replace("</b>", "")
        scrapedplot = scrapedplot.replace("<i>", "")
        scrapedplot = scrapedplot.replace("</i>", "")
        scrapedplot = scrapedplot.replace("<!--colorstart:#589BB9-->", "")
        scrapedplot = scrapedplot.replace("<!--colorend-->", "")
        scrapedplot = scrapedplot.replace("<!--/colorend-->", "")
        scrapedplot = scrapedplot.replace("<!--/colorstart-->", "")
        scrapedplot = scrapedplot.replace('<span style="color:#589BB9">', "")
        scrapedplot = scrapedplot.replace("</span>", "")
        scrapedplot = scrapedplot.strip()

        # Debugging
        if DEBUG:
            logger.info("scrapedtitle=" + scrapedtitle)
            logger.info("scrapedurl=" + scrapedurl)
            logger.info("scrapedthumbnail=" + scrapedthumbnail)
            logger.info("scrapedplot=" + scrapedplot)

        # Add to the XBMC listing
        xbmctools.addnewfolder(__channel__, "listmirrors", category,
                               scrapedtitle, scrapedurl, scrapedthumbnail,
                               scrapedplot)

    # ------------------------------------------------------
    # Extract the next page
    # ------------------------------------------------------
    patron = "<a href='([^']+)'>Siguiente</a>"
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG:
        scrapertools.printMatches(matches)

    for match in matches:
        scrapedtitle = "Pagina siguiente"
        scrapedurl = urlparse.urljoin("http://www.veocine.es/", match)
        scrapedthumbnail = ""
        scrapeddescription = ""

        # Debugging
        if DEBUG:
            logger.info("scrapedtitle=" + scrapedtitle)
            logger.info("scrapedurl=" + scrapedurl)
            logger.info("scrapedthumbnail=" + scrapedthumbnail)

        # Add to the XBMC listing
        xbmctools.addthumbnailfolder(__channel__, scrapedtitle, scrapedurl,
                                     scrapedthumbnail, "mainlist")

    # Label (top-right)...
    xbmcplugin.setPluginCategory(handle=int(sys.argv[1]), category=category)

    # Disable sorting...
    xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                             sortMethod=xbmcplugin.SORT_METHOD_NONE)

    # End of directory...
    xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=True)
    def sources(self, url, hostDict):
        sources = []
        if not url: return sources
        try:
            scraper = cfscrape.create_scraper(delay=5)
            data = parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '')
                         for i in data])
            title = data['tvshowtitle'] if 'tvshowtitle' in data else data[
                'title']
            title = title.replace('&', 'and').replace('Special Victims Unit',
                                                      'SVU')
            aliases = data['aliases']
            episode_title = data['title'] if 'tvshowtitle' in data else None
            year = data['year']
            hdlr = 'S%02dE%02d' % (int(data['season']), int(
                data['episode'])) if 'tvshowtitle' in data else year

            query = '%s %s' % (title, hdlr)
            query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)
            url = self.search_link % quote_plus(query)
            url = urljoin(self.base_link, url).replace('%3A+', '+')
            # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)
            # result = scraper.get(url).content
            result = py_tools.ensure_str(scraper.get(url).content,
                                         errors='replace')

            if not result or "Sorry, but you are looking for something that isn't here" in str(
                    result):
                return sources
            posts = client.parseDOM(result, "div", attrs={"class": "post"})
            if not posts: return sources
        except:
            source_utils.scraper_error('MAXRLS')
            return sources
        for post in posts:
            try:
                post_title = client.parseDOM(post,
                                             "h2",
                                             attrs={"class": "postTitle"})
                post_title = client.parseDOM(post_title, 'a')[0]
                if not source_utils.check_title(title, aliases, post_title,
                                                hdlr, year):
                    continue
                content = client.parseDOM(post,
                                          "div",
                                          attrs={"class": "postContent"})
                ltr = client.parseDOM(content, "p", attrs={"dir": "ltr"})
                if not ltr: continue

                for i in ltr:
                    if '<strong>' not in i or 'imdb.com' in i: continue
                    name = re.search(r'<strong>(.*?)<', i).group(1)
                    name = re.sub(r'(<span.*?>)', '',
                                  name).replace('</span>', '')
                    if title not in name:
                        continue  # IMDB and Links: can be in name so check for title match
                    name_info = source_utils.info_from_name(
                        name, title, year, hdlr, episode_title)
                    if source_utils.remove_lang(name_info): continue

                    links = client.parseDOM(i, "a", ret="href")
                    size = re.findall(
                        r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))',
                        i, re.DOTALL)

                    for link in links:
                        url = link
                        if url in str(sources): continue
                        valid, host = source_utils.is_host_valid(url, hostDict)
                        if not valid: continue

                        quality, info = source_utils.get_release_quality(
                            name_info, url)
                        try:
                            dsize, isize = source_utils._size(size[0])
                            info.insert(0, isize)
                        except:
                            dsize = 0
                        info = ' | '.join(info)

                        sources.append({
                            'provider': 'maxrls',
                            'source': host,
                            'name': name,
                            'name_info': name_info,
                            'quality': quality,
                            'language': 'en',
                            'url': url,
                            'info': info,
                            'direct': False,
                            'debridonly': True,
                            'size': dsize
                        })
            except:
                source_utils.scraper_error('MAXRLS')
        return sources
    def sources(self, url, hostDict, hostprDict):
        try:    
            sources = []
            
            if url == None: return sources
      
            data = urlparse.parse_qs(url)

            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            categ = 'tv-shows' if 'tvshowtitle' in data else 'movies'
            hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']
            query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
            query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

            url = self.search_link % (urllib.quote_plus(query), categ)
            url = urlparse.urljoin(self.base_link, url)

            headers = {'User-Agent': self.useragent}
            r = self.scraper.get(url, headers=headers).content

            r = re.findall('<item>\s*<title>([^<>]+)<\/title>\s*<link>([^<>]+)<\/link>', r)
            if len(r) == 0: raise Exception()

            hostDict = hostprDict + hostDict

            items = []

            for item in r:
                try:
                    t = item[0].rsplit('&#038;', 1)[0]

                    if any(x in t.lower() for x in ['.bonus.', '.extra.', '.extras.']): raise Exception()

                    t = re.sub('(\[.*?\])|(<.+?>)', '', t)
                    t1 = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d+|3D)(\.|\)|\]|\s|)(.+|)', '', t)

                    if not cleantitle.get(t1) == cleantitle.get(title): raise Exception()

                    y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', t)[-1].upper()

                    if not y == hdlr: raise Exception()

                    headers = {'User-Agent': self.useragent}
                    data = self.scraper.get(item[1], headers=headers).content

                    data = client.parseDOM(data, 'div', attrs={'class': 'cont.+?'})[0]
                    data = dom_parser2.parse_dom(data, 'a', req='href')

                    u = [(t, i.attrs['href']) for i in data]  
                    items += u

                except:
                    pass
					
            for item in items:
                try:
                    name = item[0]
                    name = client.replaceHTMLCodes(name)

                    quality, info = source_utils.get_release_quality(name, item[1])

                    url = item[1]

                    if not url.startswith('http'): continue
                    if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    valid, host = source_utils.is_host_valid(url, hostDict)
                    if not valid: continue
                    host = client.replaceHTMLCodes(host)
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': ' | '.join(info), 'direct': False, 'debridonly': True})
                except:
                    pass

            return sources
        except:
            log_utils.log('>>>> %s TRACE <<<<\n%s' % (__file__.upper().split('\\')[-1].split('.')[0], traceback.format_exc()), log_utils.LOGDEBUG)
            return sources
 def parse(self, response):
     for product_url in response.xpath('//div[contains(@class, "products")]//a/@href').extract():
         yield scrapy.Request(urlparse.urljoin(response.url, product_url),
                              callback=self.__parse_product_page)
Beispiel #49
0
from flask import Flask, request
import os
import os.path
import re
import shelve
import subprocess
import threading
import time
import urlparse

## FIXME:  This should go into a config file.
REPO_FS_BASE = '/srv/release/repository/release'
REPO_HTTP_BASE = 'http://packages.release.eucalyptus-systems.com/'
YUM_BASE = 'yum/builds/'
RPM_FS_BASE = os.path.join(REPO_FS_BASE, YUM_BASE)
RPM_HTTP_BASE = urlparse.urljoin(REPO_HTTP_BASE, YUM_BASE)
RESULT_CACHE_FILENAME = '/var/lib/genrepo/result-cache'

# A python shelf object:  the lazy man's key-value store
RESULT_CACHE = None
RESULT_CACHE_LOCK = threading.Lock()
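# A minimal usage sketch (hypothetical helper, not part of the original module),
# assuming RESULT_CACHE has been opened elsewhere via shelve.open(RESULT_CACHE_FILENAME).
# shelve objects are not thread-safe, so every read and write goes through the lock.
def _cached_result(key, compute):
    # Return the cached value for key, computing, storing and syncing it on a miss.
    with RESULT_CACHE_LOCK:
        if key in RESULT_CACHE:
            return RESULT_CACHE[key]
        value = compute()
        RESULT_CACHE[key] = value
        RESULT_CACHE.sync()
        return value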

app = Flask(__name__)


@app.route('/api/1/genrepo/', methods=['GET', 'POST'])
def do_genrepo():
    if request.method == 'POST':
        params = request.form
    elif request.method == 'GET':
        params = request.args
Beispiel #50
0
    def parse_hansard_post_1998(self, response):
        sel = Selector(response)    

        # Get the year that this index page is for
        # Meetings (Year 2013 - 2014)
        # This is mostly for debugging purposes so we can spit this out in the logs
        year_range = sel.xpath('//strong/em/text()').extract()
        if not year_range:
            self.log("%s: Could not find year range on hansard index page" % response.url, level=log.WARNING)
            return
        else:
            self.log("%s: Parsing Hansard Index: %s" % (response.url, year_range), level=log.INFO)

        # Find any dates at the top of this page. Other dates are identical
        # to this page, and indeed the current page will also be included in
        # the date list. Scrapy will prevent us recursing back into ourselves.
    
        year_urls = sel.xpath('//tr/td/a[contains(@href,"#toptbl")]/@href').extract()
        for year_url in year_urls:
            absolute_url = urlparse.urljoin(response.url, year_url.strip())
            req = Request(absolute_url, callback = self.parse_hansard_index_page)
            yield req
        
        # We are looking for table rows which link to Hansard entries for a
        # particular date. In newer versions these are 6-columned table rows
        # where column 6 is a link to a webcast (doesn't seem to exist)
        # Older revisions are 5 columned rows. These are all after the anchor
        # 'hansard'.

        print "Parsing Rows"
        # Find the Hansard table
        table = sel.xpath("//div[@class='table_overflow']//a[@name='hansard']/following::table[1]")
        if not table:
            # http://www.legco.gov.hk/general/english/counmtg/yr08-12/mtg_0910.htm
            table = sel.xpath("//div[@id='_content_']//a[@name='hansard']/following::table[1]")

        rows = table.xpath(".//tr[count(td)>=5]")
        if not rows:
            self.log("%s: Could not find any Handard entries to crawl into" % response.url, level=log.WARNING)
            return
    
        self.log("%s: %i rows found" % (response.url, len(rows)), level=log.INFO)

        for row in rows:
            date_info = ' '.join(row.xpath('.//td[1]/node()/text()').extract())
            self.log("%s: Row: %s" % (response.url, date_info), level=log.INFO)

            # Recurse into the agenda, if it exists
            agenda_url = row.xpath('.//td[2]/a/@href').extract()
            if agenda_url:
                absolute_url = urlparse.urljoin(response.url, agenda_url[0].strip())
                req = Request(absolute_url, callback = self.parse_hansard_agenda)
                yield req
            else:
                self.log("%s: Could not find an agenda URL for %s" % (response.url, date_info), level=log.WARNING)
        
            # Download the minutes document if it exists. This is a PDF file
            minutes_url = row.xpath('.//td[3]/a/@href').extract()
            if minutes_url:
                absolute_url = urlparse.urljoin(response.url, minutes_url[0].strip())
                minutes = HansardMinutes()
                minutes['date'] = date_info
                minutes['file_urls'] = [absolute_url]
                yield minutes
            else:
                self.log("%s: Could not find an minutes URL for %s" % (response.url, date_info), level=log.WARNING)

            for (lang, index) in [('en',4),('cn',5)]:

                hansard_urls = row.xpath('.//td[%i]/a/@href' % index).extract()
                for url in hansard_urls:
                    # Is this a PDF entry, or do we need to recurse?
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    if absolute_url.endswith('pdf'):
                        hansard_record = HansardRecord()
                        hansard_record['date'] = date_info
                        hansard_record['language'] = lang
                        hansard_record["file_urls"] = [absolute_url]
                        yield hansard_record
                    else:
                        # Recurse into the HTML handler for the HTML Hansard Record Index
                        req = Request(absolute_url, callback = self.parse_hansard_html_record)
                        yield req

                if not hansard_urls:
                    self.log("%s: Could not find an hansard URL for %s, lang %s" % (response.url, date_info, lang), level=log.WARNING)
Beispiel #51
0
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            if debrid.status() == False: raise Exception()

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']

            hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

            query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
            query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

            url = self.search_link % urllib.quote_plus(query)
            url = urlparse.urljoin(self.base_link, url)

            r = client.request(url)

            posts = client.parseDOM(r, 'item')

            hostDict = hostprDict + hostDict

            items = []

            for post in posts:
                try:
                    t = client.parseDOM(post, 'title')[0]

                    c = client.parseDOM(post, 'content.+?')[0]

                    u = re.findall('>Single Link(.+?)p>\s*<span', c.replace('\n', ''))[0]

                    u = client.parseDOM(u, 'a', ret='href')

                    s = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', c)
                    s = s[0] if s else '0'

                    items += [(t, i, s) for i in u]
                except:
                    pass

            for item in items:
                try:
                    name = item[0]
                    name = client.replaceHTMLCodes(name)

                    t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name)

                    if not cleantitle.get(t) == cleantitle.get(title): raise Exception()

                    y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper()

                    if not y == hdlr: raise Exception()
                    quality, info = source_utils.get_release_quality(name, item[1])

                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', item[2])[-1]
                        div = 1 if size.endswith(('GB', 'GiB')) else 1024
                        size = float(re.sub('[^0-9|/.|/,]', '', size))/div
                        size = '%.2f GB' % size
                        info.append(size)
                    except:
                        pass

                    info = ' | '.join(info)

                    url = item[1]
                    if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    valid, host = source_utils.is_host_valid(url, hostDict)
                    if not valid: raise Exception()
                    host = client.replaceHTMLCodes(host)
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
                except:
                    pass

            check = [i for i in sources if not i['quality'] == 'CAM']
            if check: sources = check

            return sources
        except:
            return sources
Beispiel #52
0
def generate_deb_repo(distro, release, arch, url, commit):
    if (distro, release) not in (('ubuntu', 'lucid'), ('ubuntu', 'precise'),
                                 ('debian', 'sid')):
        return 'Error: invalid release:  %s %s' % (distro, release), 400
    if url.endswith("eucalyptus"):
        package_name = "eucalyptus"
    elif url.endswith("internal"):
        package_name = "eucalyptus-enterprise"
    else:
        return ('Error: Invalid url.  Please end your URL with "eucalyptus" '
                'or "internal"'), 400

    # Truncate to 6 characters
    commit = commit[:6]

    # Locate debs
    pool = os.path.join(REPO_FS_BASE, distro, 'pool/main/e', package_name)
    pool_contents = os.listdir(pool)
    current_high_ver = "0"
    counter = 0
    for euca_file in pool_contents:
        if (commit in euca_file and euca_file.endswith('.deb')
                and release in euca_file):
            # Now determine the newest one
            fields = euca_file.split("_")
            euca_file_ver = fields[1]
            if apt.VersionCompare(euca_file_ver, current_high_ver) >= 1:
                current_high_ver = euca_file_ver
            counter += 1

    # eucalyptus has 10 binary packages (java-common may go away) and internal
    # has 4 + a dummy package if we have less than that, bail, as an invalid
    # hash has been detected
    if (package_name == 'eucalyptus' and counter < 9) or counter < 4:
        return ('Error: You have requested a commit that does not exist in '
                'this distro/release.'), 404

    # Generate the repository
    time.sleep(1)
    timestamp = str(int(time.time()))
    try:
        subprocess.check_call([
            'generate-eucalyptus-repository', distro, release,
            commit + '-' + timestamp
        ])
    except subprocess.CalledProcessError:
        return 'Error: failed to generate the repository', 500
    current_repo_name = release + "-" + commit + "-" + timestamp

    for euca_file in pool_contents:
        if (current_high_ver in euca_file and release in euca_file
                and euca_file.endswith('.deb')):
            try:
                subprocess.check_call([
                    'reprepro', '--keepunreferencedfiles', '-V', '-b',
                    os.path.join(REPO_FS_BASE, distro), 'includedeb',
                    current_repo_name,
                    os.path.join(pool, euca_file)
                ])
            except subprocess.CalledProcessError:
                return 'Error: failed to add DEBs to new repo', 500
    # Return the repo information
    return ' '.join(
        ('deb', urlparse.urljoin(REPO_HTTP_BASE,
                                 distro), current_repo_name, 'main')), 201
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            if not str(url).startswith('http'):

                data = urlparse.parse_qs(url)
                data = dict([(i, data[i][0]) if data[i] else (i, '')
                             for i in data])

                if 'tvshowtitle' in data:
                    url = '%s/drama/%s/episode-%01d/' % (
                        self.base_link, cleantitle.geturl(
                            data['tvshowtitle']), int(data['episode']))
                else:
                    url = '%s/movie/%s/' % (self.base_link,
                                            cleantitle.geturl(data['title']))

                url = client.request(url, timeout='10', output='geturl')
                if url == None: raise Exception()

            else:
                url = urlparse.urljoin(self.base_link, url)

            r = client.request(url, timeout='10')
            links = client.parseDOM(r, 'iframe', ret='src')

            for link in links:
                if 'vidnow' in link:
                    r = client.request(link, timeout='10')
                    s = re.findall('window\.atob\(\"(.*?)\"\)', r)
                    r = re.findall('(https:.*?(openload|redirector).*?)[\'\"]',
                                   r)

                    for i in s:
                        i = base64.b64decode(i)
                        try:
                            sources.append({
                                'source':
                                'gvideo',
                                'quality':
                                directstream.googletag(i)[0]['quality'],
                                'language':
                                'ko',
                                'url':
                                i,
                                'direct':
                                True,
                                'debridonly':
                                False
                            })
                        except:
                            pass

                    for i in r:
                        if 'openload' in i:
                            try:
                                sources.append({
                                    'source': 'openload',
                                    'quality': 'SD',
                                    'language': 'ko',
                                    'url': i[0],
                                    'direct': False,
                                    'debridonly': False
                                })
                            except:
                                pass
                        elif 'google' in i:
                            try:
                                sources.append({
                                    'source':
                                    'gvideo',
                                    'quality':
                                    directstream.googletag(i)[0]['quality'],
                                    'language':
                                    'ko',
                                    'url':
                                    i[0],
                                    'direct':
                                    True,
                                    'debridonly':
                                    False
                                })
                            except:
                                pass
                        else:
                            pass
                else:
                    pass

            return sources
        except:
            return sources
Beispiel #54
0
        '[@None] Fetching language pack manifests from {0}'.format(list_url))

    if not list_url.startswith(settings.LANGPACK_DOWNLOAD_BASE):
        log.error('[@None] Not fetching language packs from invalid URL: '
                  '{0}'.format(base_url))
        raise ValueError('Invalid path')

    try:
        req = requests.get(list_url, verify=settings.CA_CERT_BUNDLE_PATH)
    except Exception, e:
        log.error('[@None] Error fetching language pack list {0}: {1}'.format(
            path, e))
        return

    xpi_list = [
        urljoin(list_base, line[-1])
        for line in map(str.split, req.iter_lines())
    ]

    allowed_file = re.compile(r'^[A-Za-z-]+\.xpi$').match

    for url in xpi_list:
        # Filter out files not in the target langpack directory.
        if not url.startswith(base_url):
            continue

        xpi = url[len(base_url):]
        # Filter out entries other than direct child XPIs.
        if not allowed_file(xpi):
            continue
    def parse1(self, response):
        # print response.body
        if '页面不存在' in response.body:
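            # '页面不存在' means "page does not exist": the site is serving its
            # not-found (likely anti-scraping) page, so rebuild session-style cookies
            # and rotate the User-Agent before retrying the same URL.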
            Cookie = {'_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
                      'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                      's_ViewType': '10',
                      'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                      }
            header['User-Agent'] = random.choice(ua_list)
            yield Request(response.url, errback=self.parse_failure,
                          callback=self.parse, headers=header, cookies=Cookie, dont_filter=True)
        else:
            sel = Selector(response)
            detail_list = sel.xpath('//div[@class="reviews-items"]/ul/li')
            if detail_list:
                for detail in detail_list:
                    item = DianpingcommentItem()
                    comment_id = ''.join(detail.xpath('./a/@data-user-id').extract())
                    item['comment_id'] = comment_id
                    shop_id = ''.join(re.findall('com/shop/(.*?)/review', response.url))
                    item['shop_id'] = shop_id
                    href = ''.join(
                        detail.xpath('./a/@href').extract()).strip().replace(
                        '\n', '')
                    name = ''.join(
                        detail.xpath('./div[@class="main-review"]/div/a[@class="name"]/text()').extract()).strip().replace(
                        '\n', '')
                    # print href
                    item['user_name'] = name
                    user_id = href.replace('/member/', '').strip().replace('\n', '')
                    item['user_id'] = user_id
                    total_score = ''.join(detail.xpath(
                        './div[@class="content"]/div[@class="user-info"]/span[1]/@class').extract()).strip().replace(
                        '\n',
                        '')
                    if not total_score:
                        total_score = ''.join(detail.xpath(
                            './div[@class="content"]/p[@class="shop-info"]/span[1]/@class').extract()).strip().replace(
                            '\n',
                            '')
                    total_score = total_score.replace('item-rank-rst irr-star', '')
                    if total_score:
                        total_score = int(total_score) / 10
                    item['total_score'] = total_score
                    scores = detail.xpath('./div[@class="content"]/div[@class="user-info"]/div/span/text()').extract()
                    if scores:
                        if len(scores) == 3:
                            score1 = scores[0]
                            score2 = scores[1]
                            score3 = scores[2]
                            score1_name = score1[:-1]
                            score1 = score1[-1:]
                            item['score1_name'] = score1_name
                            item['score1'] = score1

                            score2_name = score2[:-1]
                            score2 = score2[-1:]
                            item['score2_name'] = score2_name
                            item['score2'] = score2

                            score3_name = score3[:-1]
                            score3 = score3[-1:]
                            item['score3_name'] = score3_name
                            item['score3'] = score3
                    else:
                        item['score1_name'] = ''
                        item['score2_name'] = ''
                        item['score3_name'] = ''
                        item['score1'] = 0
                        item['score2'] = 0
                        item['score3'] = 0
                    comment_txt = ''.join(detail.xpath(
                        './div[@class="content"]/div[@class="comment-txt"]/div/text()').extract()).strip().replace('\n',
                                                                                                                   '')
                    item['comment_text'] = comment_txt
                    comment_dt = ''.join(detail.xpath(
                        './div[@class="content"]/div[@class="misc-info"]/span/text()').extract()).strip().replace('\n',
                                                                                                                  '')
                    if comment_dt:
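                        # drop the u'更新于' ("updated on") prefix and normalise
                        # whitespace before parsing the date below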
                        comment_dt = comment_dt.replace(u'更新于', '')
                        comment_dt = comment_dt.replace('\n', '').replace('\r', '').replace('\t', '').strip()
                        comment_dt = comment_dt.split(u'\xa0')
                        if comment_dt:
                            comment_dt = comment_dt[0]

                        if len(comment_dt) == 5:
                            comment_dt = '2017-' + comment_dt
                        elif len(comment_dt) == 8:
                            comment_dt = '20' + comment_dt
                        if ' ' in comment_dt:
                            comment_dt = comment_dt.split(' ')[0]
                    else:
                        comment_dt = ''.join(detail.xpath(
                            './div[@class="content"]/div[@class="misc-info"]/span/a[@class="time"]/text()').extract()).strip().replace(
                            '\n',
                            '')

                    item['comment_dt'] = comment_dt
                    contribution = ''.join(
                        detail.xpath(
                            './div[@class="pic"]/p[@class="contribution"]/span/@title').extract()).strip().replace(
                        '\n', '')
                    contribution = contribution.replace('贡献值', '').strip()
                    item['user_contrib_val'] = contribution
                    # try:
                    #     db_insert.insert('t_hh_dianping_shop_comments', **item)
                    # except:
                    #     pass
                    yield item
                next_page = sel.xpath('//a[@class="NextPage"]/@href')
                if next_page:
                    next_page = ''.join(next_page.extract())
                    next_page = urljoin(response.url, next_page)
                    print next_page
                    Cookie = {'_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
                              'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                              's_ViewType': '10',
                              'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                              }
                    header['User-Agent'] = random.choice(ua_list)
                    yield Request(next_page, errback=self.parse_failure,
                                  callback=self.parse, headers=header, cookies=Cookie, dont_filter=True, )
            else:
                print response.body
Beispiel #56
0
def episodios(item):
    logger.info("tvalacarta.channels.eltrece episodios")

    itemlist = []
    '''
    <div  about="/la-noche-de-mirtha/programa-38_074529" typeof="sioc:Item foaf:Document" class="ds-1col node node--capitulo-completo view-mode-c13_capitulo_completo node--c13-capitulo-completo node--capitulo-completo--c13-capitulo-completo clearfix">
    <figure data-desktop="217x122" data-tabletlandscape="217x122" data-tabletportrait="217x122" data-mobilelandscape="217x122" data-mobileportrait="217x122" alt="Programa 38 (10-01-15)" data-width="90" data-height="90" data-timestamp="1421945563"  data-uri="public://2015/01/11/mirthascioli.jpg" class="field field--name-field-images field--type-image field--label-hidden" ><a href="/la-noche-de-mirtha/programa-38_074529" data-pagetype="capitulo_completo"><span class="hasvideo"></span><noscript><img src='public://styles/90x90/public/2015/01/11/mirthascioli.jpg?t=1421945563' width='90' height='90' alt='Programa 38 (10-01-15)' /></noscript></a><figcaption></figcaption></figure>
    <h2><a data-pagetype="capitulo_completo" href="/la-noche-de-mirtha/programa-38_074529">Programa 38 (10-01-15)</a></h2>
    <p>Invitados del programa de hoy: Daniel Scioli, Alejandra Maglietti, Facundo...</p></div>
    '''
    # Download the page
    data = scrapertools.cache_page(item.url)
    item.url = urlparse.urljoin(
        item.url,
        scrapertools.find_single_match(
            data, 'href="(/[^\/]+/capitulos-completos)">Cap'))

    # Look for the "Capítulos completos" (full episodes) option
    data = scrapertools.cache_page(item.url)
    matches = re.compile('<figure(.*?)</div>', re.DOTALL).findall(data)

    for match in matches:
        logger.info("tvalacarta.channels.eltrece programas match=" + match)
        title = scrapertools.find_single_match(
            match,
            '<a data-pagetype="capitulo_completo" href="[^"]+">([^<]+)</a>')

        if title == "":
            title = scrapertools.find_single_match(
                match, "<figcaption>([^<]+)</figcaption>")

        if title == "":
            title = scrapertools.find_single_match(match, 'alt="([^"]+)"')

        title = scrapertools.htmlclean(title)
        url = urlparse.urljoin(
            item.url, scrapertools.find_single_match(match,
                                                     'a href="([^"]+)"'))

        thumbnail = scrapertools.find_single_match(
            match, 'data-uri="public\:\/\/([^"]+)"')
        thumbnail = "http://eltrecetv.cdncmd.com/sites/default/files/styles/298x168/public/" + thumbnail
        plot = scrapertools.find_single_match(match, '<p>([^<]+)</p>')

        if (DEBUG):
            logger.info("title=[" + title + "], url=[" + url +
                        "], thumbnail=[" + thumbnail + "]")

        # Add to the listing
        itemlist.append(
            Item(channel=CHANNEL,
                 action="play",
                 server="eltrece",
                 title=title,
                 url=url,
                 thumbnail=thumbnail,
                 plot=plot,
                 fanart=thumbnail,
                 viewmode="movie_with_plot",
                 folder=False))

    # Pagination
    current_page = scrapertools.find_single_match(item.url, "page\=(\d+)")
    logger.info("tvalacarta.channels.eltrece programas current_page=" +
                current_page)
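    # e.g. a hypothetical '.../capitulos-completos' URL becomes '...?page=1', while
    # '...?page=3' becomes '...?page=4'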
    if current_page == "":
        next_page_url = item.url + "?page=1"
    else:
        next_page_url = item.url.replace("page=" + current_page,
                                         "page=" + str(int(current_page) + 1))
    logger.info("tvalacarta.channels.eltrece programas next_page_url=" +
                next_page_url)
    itemlist.append(
        Item(channel=CHANNEL,
             action="episodios",
             title=">> Página siguiente",
             url=next_page_url,
             folder=True))

    return itemlist
Beispiel #57
0
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl,
                                   tag,
                                   'href',
                                   filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
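                # filemap caches normalised URL -> saved local path, so links that
                # were already downloaded are simply re-pointed at the existing copy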
                if self.filemap.has_key(nurl):  # noqa
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link' + str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r' % iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    st = time.time()
                    soup = self.get_soup(dsrc, url=iurl)
                    self.log.debug('Parsed %s in %.1f seconds' %
                                   (iurl, time.time() - st))

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace(
                        '%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl,
                                           recursion_level + 1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug(
                            'Recursion limit reached. Skipping links in', iurl)

                    if newbaseurl and not newbaseurl.startswith('/'):
                        for atag in soup.findAll(
                                'a', href=lambda x: x and x.startswith('/')):
                            atag['href'] = urlparse.urljoin(
                                newbaseurl, atag['href'], True)
                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(
                            soup, c == 0 and recursion_level == 0
                            and not getattr(self, 'called_first', False),
                            self.job_info)

                        if c == 0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception as err:
                    if isinstance(err, AbortArticle):
                        raise
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print
        return res
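
One detail worth noting in the method above is the final pass over root-relative links: once a page has been fetched, any href that still starts with "/" is rewritten against the page's base URL (which may have been overridden by a <base> tag). A hedged sketch of just that pass, using bs4 and an invented page purely for illustration (the original uses calibre's bundled BeautifulSoup):

try:
    import urlparse                       # Python 2, as in the example above
except ImportError:
    from urllib import parse as urlparse  # Python 3 equivalent

from bs4 import BeautifulSoup  # stand-in for calibre's bundled BeautifulSoup

newbaseurl = "http://example.com/articles/today/index.html"  # hypothetical base URL
soup = BeautifulSoup('<a href="/articles/today/page2.html">next</a>', "html.parser")

if newbaseurl and not newbaseurl.startswith('/'):
    for atag in soup.find_all('a', href=lambda x: x and x.startswith('/')):
        # Third argument is allow_fragments, kept True as in the original call.
        atag['href'] = urlparse.urljoin(newbaseurl, atag['href'], True)

print(soup.a['href'])  # -> http://example.com/articles/today/page2.html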
Beispiel #58
0
    # Start running maps and POSTs
    for m in dataMaps:
      #print 'Building %s map' % m
      data = {
        'episode_id': 1
      }
      for d in dataMaps[m]['map']:
        if parts[dataMaps[m]['map'][d]] != '-':
          #print '%s: %s' % (d, parts[dataMaps[m]['map'][d]])
          data[d] = parts[dataMaps[m]['map'][d]]

      #print 'data is:'
      #print data

      #print 'Making url from %s and %s' % (baseUrl, dataMaps[m]['uri'])
      url = urlparse.urljoin(baseUrl, dataMaps[m]['uri'])
      print url
      urlBits = urlparse.urlparse(url)

      params = urllib.urlencode({'number': 12524, 'type': 'issue', 'action': 'show'})
      headers = {
        "Content-type": "application/json",
        "Accept": "application/json",
        "Authorization": "Token " + token
      }

      print 'sending ' + json.dumps(data, separators=(',',':'))

      response = requests.post(url, data=json.dumps(data, separators=(',',':')), headers=headers)
      #print response.status, response.reason
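
The fragment above relies on baseUrl ending in a slash: urljoin keeps everything up to the last "/" of the base and appends the relative URI to it. A small sketch of that step with placeholder values; the endpoint, payload, and token are invented and not the original service.

import json
import requests

try:
    import urlparse                       # Python 2, as in the example above
except ImportError:
    from urllib import parse as urlparse  # Python 3 equivalent

baseUrl = "https://tracker.example.com/api/"   # hypothetical base; note the trailing slash
uri = "episodes"                               # hypothetical dataMaps[m]['uri']
url = urlparse.urljoin(baseUrl, uri)           # -> https://tracker.example.com/api/episodes

data = {"episode_id": 1}
headers = {
    "Content-type": "application/json",
    "Accept": "application/json",
    "Authorization": "Token " + "hypothetical-token",
}
response = requests.post(url, data=json.dumps(data, separators=(',', ':')), headers=headers)
print(response.status_code)

Without the trailing slash ("https://tracker.example.com/api"), urljoin would drop the last path segment and produce "https://tracker.example.com/episodes" instead.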
Beispiel #59
0
 def process_images(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.
                             has_key('src')):  # noqa
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlparse.urlsplit(iurl).scheme:
                 iurl = urlparse.urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):  # noqa
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         if isinstance(fname, unicode):
             fname = fname.encode('ascii', 'replace')
         itype = what(None, data)
         if itype is None and b'<svg' in data[:1024]:
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
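
The image loop above only rewrites a src when urlsplit reports no scheme, so already-absolute URLs pass through untouched while relative and scheme-relative ones are resolved against the page URL. A short sketch of that check; baseurl and the candidate src values are invented.

try:
    import urlparse                       # Python 2, as in the example above
except ImportError:
    from urllib import parse as urlparse  # Python 3 equivalent

baseurl = "http://example.com/news/story.html"
for src in ("img/photo.jpg", "//cdn.example.com/a.png", "http://other.example.com/b.gif"):
    if not urlparse.urlsplit(src).scheme:
        # allow_fragments=False, matching the original call.
        resolved = urlparse.urljoin(baseurl, src, False)
    else:
        resolved = src
    print(resolved)
# -> http://example.com/news/img/photo.jpg
# -> http://cdn.example.com/a.png          (scheme-relative, so it is still joined)
# -> http://other.example.com/b.gif        (already absolute, left as-is)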
Beispiel #60
0
    def _get_issuer_publickey(self, issuer, key_id=None, insecure=False):
        
        # Set the user agent so Cloudflare isn't mad at us
        headers={'User-Agent': 'SciTokens/{}'.format(PKG_VERSION)}
        
        # Go to the issuer's website, and download the OAuth well known bits
        # https://tools.ietf.org/html/draft-ietf-oauth-discovery-07
        well_known_uri = ".well-known/openid-configuration"
        if not issuer.endswith("/"):
            issuer = issuer + "/"
        parsed_url = urlparse.urlparse(issuer)
        updated_url = urlparse.urljoin(parsed_url.path, well_known_uri)
        parsed_url_list = list(parsed_url)
        parsed_url_list[2] = updated_url
        meta_uri = urlparse.urlunparse(parsed_url_list)
        
        # Make sure the protocol is https
        if not insecure:
            parsed_url = urlparse.urlparse(meta_uri)
            if parsed_url.scheme != "https":
                raise NonHTTPSIssuer("Issuer is not over HTTPS.  RFC requires it to be over HTTPS")
        response = request.urlopen(request.Request(meta_uri, headers=headers))
        data = json.loads(response.read().decode('utf-8'))
        
        # Get the keys URL from the openid-configuration
        jwks_uri = data['jwks_uri']
        
        # Now, get the keys
        if not insecure:
            parsed_url = urlparse.urlparse(jwks_uri)
            if parsed_url.scheme != "https":
                raise NonHTTPSIssuer("jwks_uri is not over HTTPS, insecure!")
        response = request.urlopen(request.Request(jwks_uri, headers=headers))
        keys_data = json.loads(response.read().decode('utf-8'))
        # Loop through each key, looking for the right key id
        public_key = ""
        raw_key = None
        
        # If there is no kid in the header, then just take the first key?
        if key_id is None:
            if len(keys_data['keys']) != 1:
                raise NotImplementedError("No kid in header, but multiple keys in "
                                          "response from certs server.  Don't know which key to use!")
            else:
                raw_key = keys_data['keys'][0]
        else:
            # Find the right key
            for key in keys_data['keys']:
                if key['kid'] == key_id:
                    raw_key = key
                    break

        if raw_key is None:
            raise MissingKeyException("Unable to find key at issuer {}".format(jwks_uri))

        if raw_key['kty'] == "RSA":
            public_key_numbers = rsa.RSAPublicNumbers(
                long_from_bytes(raw_key['e']),
                long_from_bytes(raw_key['n'])
            )
            public_key = public_key_numbers.public_key(backends.default_backend())
        elif raw_key['kty'] == 'EC':
            public_key_numbers = ec.EllipticCurvePublicNumbers(
                   long_from_bytes(raw_key['x']),
                   long_from_bytes(raw_key['y']),
                   ec.SECP256R1()
               )
            public_key = public_key_numbers.public_key(backends.default_backend())
        else:
            raise UnsupportedKeyException("SciToken signed with an unsupported key type")
        
        return public_key
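
The method above forces a trailing slash on the issuer before joining because urljoin treats the last path segment of a base without a trailing slash as a file name and drops it. A sketch of the metadata-URL construction with a hypothetical issuer:

try:
    import urlparse                       # Python 2, as in the example above
except ImportError:
    from urllib import parse as urlparse  # Python 3 equivalent

well_known_uri = ".well-known/openid-configuration"

print(urlparse.urljoin("/cms", well_known_uri))   # -> /.well-known/openid-configuration  (segment lost)
print(urlparse.urljoin("/cms/", well_known_uri))  # -> /cms/.well-known/openid-configuration

# Rebuilding the issuer metadata URL, following the same steps as the method above:
issuer = "https://issuer.example.org/cms"  # hypothetical issuer
if not issuer.endswith("/"):
    issuer = issuer + "/"
parsed_url = urlparse.urlparse(issuer)
parsed_url_list = list(parsed_url)
parsed_url_list[2] = urlparse.urljoin(parsed_url.path, well_known_uri)
print(urlparse.urlunparse(parsed_url_list))
# -> https://issuer.example.org/cms/.well-known/openid-configuration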