Beispiel #1
0
def listVideos(url):
    """Scrape a euronews listing page and add one plugin entry per video.

    Adds the top story first, then every item in the article list, and
    finally closes the plugin directory.  Relies on module-level helpers
    defined elsewhere in the file: getUrl, language2, debug, addLink,
    xbmcplugin, pluginhandle, forceViewMode, viewMode, xbmc.
    """
    content = getUrl(url)

    # --- top story -------------------------------------------------------
    contenttop = content[content.find('<div class="topStoryWrapper clear">'):]
    contenttop = contenttop[:contenttop.find('<div class="subcategoryList clear">')]
    titletop = contenttop[contenttop.find('<h2 class="topStoryTitle">'):]
    match = re.compile('<a href="(.+?)" title="(.+?)"', re.DOTALL).findall(titletop)
    url = "http://" + language2 + ".euronews.com" + match[0][0]
    title = match[0][1]
    # Python 2 round-trip: bytes -> unicode (entity unescape) -> utf-8 bytes.
    title = HTMLParser().unescape(title.decode('utf-8'))
    title = title.encode('utf-8')
    match = re.compile('src="(.+?)"', re.DOTALL).findall(titletop)
    thumb = match[0]
    match = re.compile('<p>(.+?)</p>', re.DOTALL).findall(titletop)
    desc = match[0]
    match = re.compile('([0-9]+/[0-9]+ [0-9]+:[0-9]+) CET', re.DOTALL).findall(contenttop)
    datum = match[0]
    debug("TITLE: " + title)
    debug("URL: " + url)
    addLink(datum + " - " + title, url, 'playVideo', thumb, desc)

    # --- article list ----------------------------------------------------
    spl = content.split('<li class="clearAfter fixedHeight">')
    for i in range(1, len(spl)):
        item = spl[i]
        match = re.compile('([0-9]+/[0-9]+ [0-9]+:[0-9]+) CET', re.DOTALL).findall(item)
        datum = match[0]
        debug("++++++++ " + datum)
        # Split off the "INSIDERS" teaser so each fragment is parsed on its own.
        for part in item.split('<a title="INSIDERS"'):
            debug("---------")
            debug(part)
            debug("---------")
            match = re.compile('href="([^"]+?)"[ ]+title="([^"]+?)"', re.DOTALL).findall(part)
            if not match:
                debug("Keine Url")
                continue
            url = "http://" + language2 + ".euronews.com" + match[0][0]
            title = match[0][1]
            match = re.compile('src="(.+?)"', re.DOTALL).findall(part)
            # BUG FIX: the original assigned the fallback to a misspelled name
            # ("thump"), so entries without an image reused the previous
            # item's thumbnail instead of getting an empty one.
            thumb = match[0] if match else ""
            match = re.compile('<p>(.+?)</p>', re.DOTALL).findall(part)
            desc = match[0] if match else ""
            debug("URL :" + url)
            title = HTMLParser().unescape(title.decode('utf-8'))
            title = title.encode('utf-8')
            addLink(datum + " - " + title, url, 'playVideo', thumb, desc)

    xbmcplugin.endOfDirectory(pluginhandle)
    if forceViewMode == "true":
        xbmc.executebuiltin('Container.SetViewMode(' + viewMode + ')')
Beispiel #2
0
    def _download_video_subtitles(video_id):
        """Fetch the English timed-text track for *video_id* and write an SRT.

        Args:
            video_id (str): ID of the video whose subtitles are downloaded.

        Raises:
            AttributeError: if English subtitles not found.

        Returns:
            str: name of the SRT file where subtitles are located.
        """
        track_url = 'http://video.google.com/timedtext?lang=en&v={}'.format(video_id)
        xml_text = urllib2.urlopen(track_url).read()

        # An empty body means Google has no English track for this video.
        if not xml_text:
            raise AttributeError('English subtitles not found')

        # Turn HTML entity escapes back into unicode characters.
        unescaped = HTMLParser().unescape(xml_text.decode('utf-8'))
        srt_text = xml2srt.convert(unescaped)

        out_name = '{}.srt'.format(video_id)
        with open(out_name, 'w') as handle:
            handle.write(srt_text.encode('utf8'))

        return out_name
Beispiel #3
0
def unescape(string):
	"""HTML-unescape *string*; byte strings come back as utf-8 unicode.

	Unicode input is returned as produced by HTMLParser; plain (byte)
	strings are additionally decoded from utf-8.
	"""
	result = HTMLParser().unescape(string)
	if type(result) == str:
		return result.decode('utf-8')
	return result
def decoder(val, encodings=ENCODINGS):
    """HTML-unescape *val* and decode it with the first encoding that works.

    Args:
        val: value to decode; anything without a ``decode`` method is
            returned unchanged.
        encodings: iterable of codec names to try, in order.

    Returns:
        The first successful decoding of *val*, the original *val* when it
        is not decodable, or None when every encoding fails.
    """
    if not hasattr(val, 'decode'): return val
    val = HTMLParser().unescape(val)
    r = None
    for e in encodings:
        try:
            r = val.decode(e)
            break
        except Exception:
            # keep iterating til the correct encoding is discovered
            continue
    # BUG FIX: the decoded result was computed but never returned, so the
    # function always returned None.  (Also dropped the unused py2-only
    # "except Exception, err" binding.)
    return r
def decoder(val, encodings=ENCODINGS):
    """HTML-unescape *val* and decode it with the first encoding that works.

    NOTE(review): this is a duplicate definition; it shadows the earlier
    ``decoder`` — consider deleting one copy.

    Args:
        val: value to decode; anything without a ``decode`` method is
            returned unchanged.
        encodings: iterable of codec names to try, in order.

    Returns:
        The first successful decoding of *val*, the original *val* when it
        is not decodable, or None when every encoding fails.
    """
    if not hasattr(val, 'decode'): return val
    val = HTMLParser().unescape(val)
    r = None
    for e in encodings:
        try:
            r = val.decode(e)
            break
        except Exception:
            # keep iterating til the correct encoding is discovered
            continue
    # BUG FIX: the decoded result was computed but never returned, so the
    # function always returned None.  (Also dropped the unused py2-only
    # "except Exception, err" binding.)
    return r
Beispiel #6
0
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  # 0,1,2 = URL, regexOnly, CookieJarOnly
    """Resolve every ``$doregex[name]`` placeholder in *url*.

    For each placeholder whose name is a key of *regexs*, the matching
    definition dict ``m`` is processed: an optional page is fetched (with
    cookie-jar, proxy, header, post/rawpost support), ``m['expres']`` is
    applied to the page (as a regex or a ``$pyFunction:`` eval), and the
    result is substituted back into *url*.  Placeholder values may
    themselves contain ``$doregex[...]`` and are resolved recursively.

    Returns ``(url, setresolved)`` at top level, just ``url`` when
    *recursiveCall* is True, the *cookieJar* when *forCookieJarOnly* is
    True, or a ``listrepeat`` tuple for list-building definitions.

    NOTE(review): mutates the ``m`` dicts inside *regexs* in place, uses a
    mutable default for *cachedPages* (shared across calls), and installs
    global urllib openers as a side effect.  Helpers such as getCookieJar,
    saveCookieJar, doEval, doEvalFunction, askCaptcha, getEpocTime,
    javascriptUnEscape and NoRedirection are defined elsewhere in the file.
    """
    # cachedPages = {}
    # print 'url',url
    # All placeholder names referenced by the url, e.g. "$doregex[foo]" -> "foo".
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    #        print 'doRegexs',doRegexs,regexs
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            # print 'processing ' ,k
            m = regexs[k]
            # print m
            # --- cookie jar setup: create, load or save a jar on demand ---
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                # print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    # The jar spec itself contains placeholders; resolve it
                    # first (forCookieJarOnly=True returns the jar, not a url).
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            # print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar is None:
                    # print 'create cookie jar'
                    # "open[file]" loads a previously saved jar from disk.
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
                    #                            print 'cookieJar from file name',cookie_jar_file
                    cookieJar = getCookieJar(cookie_jar_file)
                    #                        print 'cookieJar from file',cookieJar
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                    # import cookielib
                    # cookieJar = cookielib.LWPCookieJar()
                    # print 'cookieJar new',cookieJar
                elif 'save[' in m['cookiejar']:
                    # "save[file]" persists the current jar to disk.
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    #                        print 'complete_path',complete_path
                    saveCookieJar(cookieJar, cookie_jar_file)

            # --- resolve nested placeholders inside the definition itself ---
            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs,
                                    m['page'],
                                    cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    # Sentinel so the later startswith('http') branch still runs.
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs,
                                                m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
            #                    print 'post is now',m['post']

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                # rawPost=True makes the recursive result get quote_plus()ed.
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                # print 'rawpost is now',m['rawpost']

            # Timestamp macros in the raw post body.
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())

            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())

            # --- obtain the page content ("link"), from cache or the net ---
            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly is False:
                # print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())

                    # print 'Ingoring Cache',m['page']
                    # "url|extra" syntax carries extra headers after a pipe.
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

                    #                            if
                    #                            proxy = ProxyHandler({ ('https' ? proxytouse[:5]=="https":"http") : proxytouse})
                    #                            opener = build_opener(proxy)
                    #                            install_opener(opener)

                    #                        print 'getproxies',getproxies()
                    # Remember the system proxies so they can be restored
                    # after a per-definition 'proxy' override.
                    current_proxies = ProxyHandler(getproxies())

                    # print 'getting pageUrl',pageUrl
                    req = Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        #                            print 'proxytouse',proxytouse
                        #                            getproxies= lambda: {}
                        if pageUrl[:5] == "https":
                            proxy = ProxyHandler({'https': proxytouse})
                            # req.set_proxy(proxytouse, 'https')
                        else:
                            proxy = ProxyHandler({'http': proxytouse})
                            # req.set_proxy(proxytouse, 'http')
                        opener = build_opener(proxy)
                        install_opener(opener)

                    # Default UA; may be overridden below via m['agent'].
                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    # Optional request headers taken from the definition.
                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        #                            print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        #                            print 'appending cookie to cookiejar',m['appendcookie']
                        # Format: "domain:name=value;domain:name=value;..."
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(version=0,
                                                  name=n,
                                                  value=v,
                                                  port=None,
                                                  port_specified=False,
                                                  domain=w,
                                                  domain_specified=False,
                                                  domain_initial_dot=False,
                                                  path='/',
                                                  path_specified=True,
                                                  secure=False,
                                                  expires=None,
                                                  discard=True,
                                                  comment=None,
                                                  comment_url=None,
                                                  rest={'HttpOnly': None},
                                                  rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        # Extra headers from the "url|n=v&n=v" suffix.
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    # --- install a global opener matching the jar/redirect needs ---
                    if cookieJar is not None:
                        #                            print 'cookieJarVal',cookieJar
                        cookie_handler = HTTPCookieProcessor(cookieJar)
                        opener = build_opener(cookie_handler,
                                              HTTPBasicAuthHandler(),
                                              HTTPHandler())
                        # NOTE(review): install_opener() returns None, so
                        # 'opener' is clobbered here — the install side
                        # effect is what matters.
                        opener = install_opener(opener)
                        #                            print 'noredirect','noredirect' in m

                        if 'noredirect' in m:
                            opener = build_opener(cookie_handler,
                                                  NoRedirection,
                                                  HTTPBasicAuthHandler(),
                                                  HTTPHandler())
                            opener = install_opener(opener)
                    elif 'noredirect' in m:
                        opener = build_opener(NoRedirection,
                                              HTTPBasicAuthHandler(),
                                              HTTPHandler())
                        opener = install_opener(opener)

                    if 'connection' in m:
                        #                            print '..........................connection//////.',m['connection']
                        # keepalive handler replaces the opener installed above.
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = build_opener(keepalive_handler)
                        install_opener(opener)

                    # print 'after cookie jar'
                    # --- assemble the POST body, if any ---
                    post = None

                    if 'post' in m:
                        postData = m['post']
                        # if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        # "n:v,n:v" pairs become a urlencoded form body.
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urlencode(post)

                    if 'rawpost' in m:
                        # rawpost is sent verbatim and wins over 'post'.
                        post = m['rawpost']
                        # if '$LiveStreamRecaptcha' in post:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #       post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)
                    # --- perform the request; failures leave link == '' ---
                    link = ''
                    try:

                        if post:
                            response = urlopen(req, post)
                        else:
                            response = urlopen(req)
                        # Transparently inflate gzip-encoded responses.
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        # Restore the system proxies after a per-def override.
                        if 'proxy' in m and not current_proxies is None:
                            install_opener(build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        # print repr(link)
                        # print link This just print whole webpage in LOG
                        if 'includeheaders' in m:
                            # Append response headers so the regex can match them.
                            # link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'
                        #                        print link

                        response.close()
                    except:
                        # NOTE(review): deliberate best-effort — any network
                        # error leaves link empty and processing continues.
                        pass
                    cachedPages[m['page']] = link
                    # print link
                    # print 'store link for',m['page'],forCookieJarOnly

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    # Non-http "pages": either evaluate python or use literally.
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            # --- apply the expression to the page content ---
            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs,
                                             m['expres'],
                                             cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)

            if not m['expres'] == '':
                # print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    # Interactive captcha flow supplies the value directly.
                    val = askCaptcha(m, link, cookieJar)
                    # print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)

                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    # print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    # Expression is python code evaluated against the page.
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']: return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        # Caller builds a list from eval(val) per the template.
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar

                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        # py2 fallback when val is a byte string.
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                else:
                    # Plain regular-expression extraction.
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs

                    val = ''
                    if not link == '':
                        # print 'link',link
                        # First capture group of the first match wins.
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        # No page at all: the expression itself is the value.
                        val = m['expres']

                    if rawPost:
                        #                            print 'rawpost'
                        val = quote_plus(val)
                    if 'htmlunescape' in m:
                        # val=unquote_plus(val)
                        try:
                            from HTMLParser import HTMLParser
                        except ImportError:
                            from html.parser import HTMLParser
                        val = HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                    # print 'ur',url
                    # return val
            else:
                # Empty expression: drop the placeholder entirely.
                url = url.replace("$doregex[" + k + "]", '')
    # --- final macro substitutions on the fully resolved url ---
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())

    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall: return url
    # print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved