Example #1
    def extract(self, verbose=False):

        if not self.ok or self.len_url != len(self.url.split("/")):
            #~ print self.url, self.ok
            #~ print "None(1)"
            return None

        xpath = XPath(self.url)
        if xpath is None:
            #~ print "None(2)"
            return None

        mds = {}
        data = {}

        lost_safety = 0

        for md in self.domain['md']:
            if verbose:
                print "%s:" % md
            if "@url" in self.domain['md'][md]:
                pos = self.domain['md'][md].split("[")[1].split("]")[0]
                #~ print url.split("/")[int(pos)]
                #~ html += u"<br/>\n<b>%s:</b> %s" % (u(md), u(url.split("/")[int(pos)]))
                ex = self.url.split("/")[int(pos)]
                if is_valid_meta(ex, md):
                    data[md] = ex

            else:
                extract = ""
                safety = False
                if "candidate" in self.domain['md'][md]:
                    extract = xpath.extract(self.domain['md'][md]['candidate'],
                                            True)

                    #~ if md == "image" and extract != "":
                    #~ print "IMAGE", self.domain['md'][md]['candidate'], extract
                    #~ print md, extract, self.domain['md'][md]['candidate']

                    safety = True
                extracts = []
                if extract == "":
                    # If it was found in many places it is not reliable; better to ignore it
                    #~ print self.domain['md'][md]['all']
                    #~ if len(self.domain['md'][md]['all']) > 8:
                    #~ print "********************"
                    #~ print md
                    #~ print len(self.domain['md'][md]['all'])
                    #~ continue

                    safety = False
                    safety_val = 0
                    # For a value to be considered safe, a sufficient share of its occurrences must have been at this location (the threshold below is 30% of the repeated-occurrence total)
                    safety_val = sum(
                        c for c in self.domain['md'][md]['all'].values()
                        if c > 1) * 0.3

                    xps = self.domain['md'][md]['all']
                    sorted_xp = reversed(
                        sorted(xps.iteritems(), key=operator.itemgetter(1)))

                    for xp_tuple in sorted_xp:
                        xp = xp_tuple[0]
                        try:
                            if "comment" in xp:
                                # skip so it does not get confused by HTML comments
                                continue
                            #~ print "\t", xp
                            if "@url" in xp:
                                pos = xp.split("[")[1].split("]")[0]
                                #~ print url.split("/")[int(pos)]
                                #~ html += u"<br/>\n<b>%s:</b> %s" % (u(md), u(url.split("/")[int(pos)]))
                                #~ print self.url

                                ex = self.url.split("/")[int(pos)]
                                if is_valid_meta(ex, md):
                                    data[md] = ex

                                break
                            if self.domain['md'][md]['all'][xp] == 1:
                                # a single occurrence is ignored
                                continue

                            extract = xpath.extract(xp, True)
                            if verbose:
                                print "\t", xp, extract

                            #~ if md == "description":
                            #~ print
                            #~ print
                            #~ print
                            #~ print
                            #~ print md, xp, self.domain['md'][md]['all'][xp], len(extract)
                            #~ if len(extract)<100:
                            #~ print extract

                            if extract == "":
                                continue

                            #~ print "\t\t", extract
                            #~ print md, extract, xp

                            extracts.append(extract)
                            # reliable if it appears often
                            if self.domain['md'][md]['all'][xp] > safety_val:
                                safety = True
                                break
                        except (UnicodeDecodeError, TypeError):
                            continue

                #~ if md == "description":
                #~ print len(extract), len(extracts), safety, md, is_valid_meta(extract, md)
                #~ print is_script(extract)

                _md = {}
                #~ print md, extracts
                if safety:
                    #~ print md, extracts
                    safety = is_valid_meta(extract, md)
                    if not safety:
                        lost_safety += 1

                    #~ print safety

                    if isinstance(safety, basestring):
                        if not md in [
                                "size", "infohash", "category", "episode",
                                "season"
                        ]:
                            md = safety.split("_")[0]
                        else:
                            extract = safety

                        _md[md] = extract
                        safety = True

                if not safety:
                    ok = False

                    def compare(x, y):
                        return len(y) - len(x)

                    #~ _extract = {}
                    # for the description the longest extract wins; for everything else, the first occurrence
                    extracts_sorted = sorted(
                        extracts,
                        cmp=compare) if md != "category" else extracts
                    for ext in extracts_sorted:

                        rt = is_valid_meta(ext, md)
                        if self.debug:
                            print "ITEM:", ext, extracts, rt

                        #~ if md == "category":
                        #~ print "-----"
                        #~ print ext, extracts, rt

                        if rt:
                            if md in _md:
                                #~ if md != "description":
                                #~ print "Descartando", ext, md, _md[md]
                                continue
                            ok = True
                            if isinstance(rt, bool):
                                #~ extract = ext
                                _md[md] = ext
                            else:
                                #~ print md, rt, ext
                                if md in [
                                        "size", "infohash", "episode", "season"
                                ]:
                                    #~ extract = rt
                                    _md[md] = rt
                                    #~ _extract.append(ext)
                                else:
                                    #~ print "...", rt, ext
                                    if rt in ext.lower():
                                        #~ extract = rt
                                        _md[md] = rt
                                        #~ _md.append(md)
                                        #~ _extract.append(ext)
                                    else:
                                        #~ print rt, extract, ext
                                        #~ print "[[", rt
                                        #~ md = rt.split("_")[0]
                                        #~ _md.append(rt.split("_")[0])
                                        #~ print md

                                        #~ _extract.append(ext)
                                        _md[rt.split("_")[0]] = ext

                    if not ok:
                        if len("".join(extracts)) < 2:
                            continue
                        extract += "\n\t\t[%s]\n" % (
                            "\n\t\t * ".join(extracts))

                #~ print "\t\t%s"%extract
                if safety and len(_md) == 0:
                    _md[md] = extract

                for md, extract in _md.items():
                    if not is_valid_meta(extract, md):

                        #~ print "NOT VALID", extract if len(extract) < 100 else len(extract), md
                        continue

                    if md == "image":
                        if extract != "":
                            mds[md] = {"safety": safety, "data": [extract]}
                            #~ html += u"<br/>\n<b>Imagen%s:</b> <img src='%s'>" % ("(SAFE)" if safety else "", extract)
                    else:

                        if md == "infohash":
                            extract = extract_infohash(extract)
                        if md == "size":
                            #~ print extract
                            z = is_size(extract)
                            #~ print z
                            if z:
                                extract = z

                        if md in mds:
                            mds[md]['data'].append(
                                str(extract).encode("utf-8"))
                        else:
                            try:
                                mds[md] = {
                                    "safety": safety,
                                    "data": [str(extract)]
                                }
                            except UnicodeEncodeError:
                                try:
                                    mds[md] = {
                                        "safety": safety,
                                        "data": [extract.encode("utf-8")]
                                    }
                                except UnicodeEncodeError:
                                    try:
                                        mds[md] = {
                                            "safety": safety,
                                            "data": [extract.decode("utf-8")]
                                        }
                                    except:
                                        print
                                        print
                                        print
                                        print extract
                                        raise

        #~ html += "<br/><br/><br/><br/>"
        #~ print mds
        description = None
        image = None
        category = None
        subcategory = None
        genre = None
        episode = None
        quality = None
        title = None
        all_categories = None

        extracts = {}

        for md in mds:
            safety = mds[md]['safety']
            extract = ",".join(set(mds[md]['data']))

            #~ print "****", md, extract

            if md == "image":
                if extract != "":
                    imgs = re.findall(r'<img[^>]*\ssrc="(.*?)"', extract)
                    if imgs:
                        data[md] = imgs[0]
                        #~ html += u"<br/>\n<b>Imagen%s:</b> %s" % ("(SAFE)" if safety else "", extract)
                        image = True
            else:
                try:
                    if md == "infohash":
                        #~ print "********************"
                        extract = extract_infohash(extract)
                    if md == "category":
                        if "," in extract:
                            all_categories = extract
                            extract = extract.split(",")[0]
                        category = unicode(extract, "utf-8")
                    if md == "title":
                        title = unicode(extract, "utf-8")
                    if md == "subcategory":
                        subcategory = unicode(extract, "utf-8")
                    if md == "genre":
                        genre = unicode(extract, "utf-8")
                    if md == "description":
                        description = unicode(extract, "utf-8")
                    if md == "episode":
                        episode = unicode(extract, "utf-8")
                    if md == "quality":
                        quality = unicode(extract, "utf-8")
                    if md == "size":
                        #~ print extract
                        if not extract.isdigit():
                            extract = is_size(extract)
                        #~ if extract:
                        #~ print "size:" + extract
                    data[md] = unicode(extract, "utf-8")

                    #~ html += u"<br/>\n<b>%s%s:</b> %s" % (md, u"(SAFE)" if safety else u"", unicode(extract, "utf-8"))
                except (UnicodeDecodeError, UnicodeEncodeError):
                    print "Encoding error (%d)" % len(extract)
                    raise

                    #~ print xp

        #keywords
        if title:
            rt = extract_keywords(title)
            if rt:
                data[u"keywords"] = rt

        if not quality and (title or "keywords" in data):
            rt = is_quality(title)
            if rt:
                data[u"quality"] = rt
            else:
                if "keywords" in data:
                    for kw in data[u"keywords"].split(","):
                        rt = is_quality(kw)
                        if rt:
                            data[u"quality"] = kw
                            break

        if not image and description and "src=" in description:
            imgs = re.findall(r'<img[^>]*\ssrc="(.*?)"', description)
            if len(imgs) > 0:
                #~ html += u"<br/>\n<b>Imagen(DESC):</b> <img src='%s'></img>" % (imgs[0])
                data[u"image"] = imgs[0]

        if not category and subcategory:
            category = get_category_from_subcategory(subcategory)
            if category:
                #~ html += u"<br/>\n<b>category(subc):</b> %s" % (category)
                data[u"category"] = category
        if not category and genre:
            category = get_category_from_genre(genre)
            if category:
                #~ html += u"<br/>\n<b>category(gen):</b> %s" % (category)
                data[u"category"] = category

        if not episode and title:
            rt = is_season_episode(title)
            if rt:
                #~ html += u"<br/>\n<b>episode(TIT):</b> %s" % (rt['e'])
                data[u"episode"] = rt['e']
                #~ html += u"<br/>\n<b>season(TIT):</b> %s" % (rt['s'])
                data[u"season"] = rt['s']
                if not category:
                    #~ html += u"<br/>\n<b>category(se):</b> series"
                    data[u"category"] = "series"

        if data:
            data[u"schema"] = get_schema(data)

            if "category" in data:
                tags = ""

                if all_categories:
                    tags += ",".join([
                        get_tags(c) for c in all_categories.split(",")
                        if get_tags(c)
                    ])
                else:
                    tags += get_tags(data['category'])

                if tags:
                    if 'tags' in data:
                        data['tags'] += ",%s" % tags
                    else:
                        data['tags'] = tags

        deleted = []
        for k in data:
            if isinstance(data[k], bool):
                deleted.append(k)
            else:
                data[k] = HTMLParser().unescape(data[k])

        for k in deleted:
            del data[k]

        return data
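The tail of extract() above runs HTMLParser().unescape over every collected string value and discards boolean placeholders before returning. A minimal standalone sketch of that cleanup step, in the same Python 2 style as the example; the helper name and the sample input are invented for illustration.

from HTMLParser import HTMLParser

def clean_metadata(data):
    # Mirror the closing loop of extract(): unescape strings, drop booleans.
    cleaned = {}
    for key, value in data.items():
        if isinstance(value, bool):
            continue  # boolean markers (e.g. image=True) are discarded
        cleaned[key] = HTMLParser().unescape(value)
    return cleaned

print clean_metadata({u"title": u"Fast &amp; Furious", u"image": True})
# -> {u'title': u'Fast & Furious'}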
Example #2
    def getSource(self, url, form_data, referer, xml=False, mobile=False):
        url = self.fixurl(url)

        if not referer:
            referer = url
        else:
            referer = self.fixurl(referer)

        headers = {'Referer': referer}
        if mobile:
            self.s.headers.update({
                'User-Agent':
                'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
            })

        if xml:
            headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'dinozap.info' in urlparse.urlsplit(url).netloc:
            headers['X-Forwarded-For'] = '178.162.222.111'
        if 'playerhd2.pw' in urlparse.urlsplit(url).netloc:
            headers['X-Forwarded-For'] = '178.162.222.121'
        if 'playerapp1.pw' in urlparse.urlsplit(url).netloc:
            headers['X-Forwarded-For'] = '178.162.222.122'

        if 'finecast.tv' in urlparse.urlsplit(url).netloc:
            self.s.headers.update(
                {'Cookie': 'PHPSESSID=d08b73a2b7e0945b3b1bb700f01f7d72'})

        if form_data:
            #ca**on.tv/key.php
            if 'uagent' in form_data[0]:
                form_data[0] = ('uagent',
                                urllib.quote(self.s.headers['User-Agent']))

            if '123456789' in form_data[0]:
                import random
                cotok = str(random.randrange(100000000, 999999999))
                form_data[0] = ('token', cotok)
                r = self.s.post(url,
                                headers=headers,
                                data=form_data,
                                timeout=20,
                                cookies={'token': cotok})
            else:
                r = self.s.post(url,
                                headers=headers,
                                data=form_data,
                                timeout=20)
            response = r.text
        else:
            try:
                r = self.s.get(url, headers=headers, timeout=20)
                response = r.text
            except (requests.exceptions.MissingSchema):
                response = 'pass'
        print(">>>>>>>>>>>>> LEN <<<<<<<<<", len(response))
        #if len(response) > 10:
        if self.cookie_file:
            self.save_cookies_lwp(self.s.cookies, self.cookie_file)
        return HTMLParser().unescape(response)
 def fill_details_from_wiki(self, url):
     code = ""
     try:
         u = urlopen(url)
     except:
         print("AddonManager: Debug: unable to open URL", url)
         return
     if u is None:
         print(
             "AddonManager: Debug: connection is lost (proxy setting changed?)",
             url)
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     # check if the macro page has its code hosted elsewhere, download if needed
     if "rawcodeurl" in p:
         rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">", p)
         if rawcodeurl:
             rawcodeurl = rawcodeurl[0]
             try:
                 u2 = urlopen(rawcodeurl)
             except:
                 print("AddonManager: Debug: unable to open URL",
                       rawcodeurl)
                 return
             # code = u2.read()
             # github is slow to respond... We need to use this trick below
             response = ""
             block = 8192
             #expected = int(u2.headers['content-length'])
             while 1:
                 #print("expected:",expected,"got:",len(response))
                 data = u2.read(block)
                 if not data:
                     break
                 if sys.version_info.major >= 3 and isinstance(data, bytes):
                     data = data.decode('utf-8')
                 response += data
             if response:
                 code = response
             u2.close()
     if not code:
         code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code, key=len)[-1]
             code = code.replace('--endl--', '\n')
         else:
             FreeCAD.Console.PrintWarning(
                 translate("AddonsInstaller",
                           "Unable to fetch the code of this macro."))
         # Clean HTML escape codes.
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         if sys.version_info.major < 3:
             code = code.decode('utf8')
         try:
             code = HTMLParser().unescape(code)
             code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ')
         except:
             FreeCAD.Console.PrintWarning(
                 translate("AddonsInstaller", "Unable to clean macro code")
                 + ": " + code + '\n')
         if sys.version_info.major < 3:
             code = code.encode('utf8')
     desc = re.findall(
         "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
         p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #4
    def getSource(self, url, form_data, referer, xml=False, mobile=False):
        url = self.fixurl(url)

        if not referer:
            referer = url
        else:
            referer = self.fixurl(
                referer.replace('wizhdsports.be', 'wizhdsports.is')
                .replace('ibrod.tv', 'www.ibrod.tv')
                .replace('livetv123.net', 'livetv.sx'))
        
        headers = {'Referer': referer}
        if mobile:
            self.s.headers.update({'User-Agent' : 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13E238 Safari/601.1'})
            
        if xml:
            headers['X-Requested-With'] = 'XMLHttpRequest'
            
              
        # These hosts misbehave when requests advertises Accept-Encoding,
        # so drop the header for them.
        no_compression_hosts = ('cndhlsstream.pw', 'skstream.tv', 'bstream.tech',
                                'bcast.site', 'bcast.pw', 'live247.online',
                                'indexstream.tv')
        if any(host in urlparse.urlsplit(url).netloc for host in no_compression_hosts):
            del self.s.headers['Accept-Encoding']
        
        if 'streamlive.to' in urlparse.urlsplit(url).netloc:
            self.s.verify = False

        if ('vipleague' in url or 'strikeout' in url
                or 'homerun' in url or 'nbastreams' in url):
            self.s.verify = False
            
        #if 'dinostream.pw' in urlparse.urlsplit(url).netloc:
            #self.s.headers.update({'Upgrade-Insecure-Requests': '1'})
            # self.s.headers.update({'Host': 'wwww.dinostream.pw'})
            # self.s.headers.update({'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'})
            # self.s.headers.update({'Accept-Language' : 'en-US,en;q=0.8,de;q=0.6,es;q=0.4'})
            # self.s.headers.update({'Accept-Encoding': 'gzip, deflate, sdch'})
            # self.s.headers.update({'Connection' : 'keep-alive'})
        

        if form_data:
            #zo**tv
            #if 'uagent' in form_data[0]:
               #form_data[0] = ('uagent',urllib.quote(self.s.headers['User-Agent']))
                #if len(form_data) > 4 and 'Cookie' in form_data[4]:
                    #headers['Cookie'] = form_data[4][1]
                    #del form_data[4]
                   
                #headers['Content-Type'] = 'application/x-www-form-urlencoded'
                #headers['User-Agent'] = self.s.headers['User-Agent']
                #lib.common.log("JairoX10:" + form_data[0][1])
               

            r = self.s.post(url, headers=headers, data=form_data, timeout=20)
        else:
            try:
                r = self.s.get(url, headers=headers, timeout=20)
            except (requests.exceptions.MissingSchema):
                return 'pass'
        
        #many utf8 encodings are specified in HTTP body not headers and requests only checks headers, maybe use html5lib
        #https://github.com/kennethreitz/requests/issues/2086
        if 'streamlive.to' in urlparse.urlsplit(url).netloc \
        or 'sport365.live' in urlparse.urlsplit(url).netloc \
        or 'vipleague' in urlparse.urlsplit(url).netloc \
        or 'cinestrenostv.tv' in urlparse.urlsplit(url).netloc \
        or 'batmanstream.com' in urlparse.urlsplit(url).netloc \
        or 'sportcategory.com' in urlparse.urlsplit(url).netloc:
            r.encoding = 'utf-8'
        if 'lfootball.ws' in urlparse.urlsplit(url).netloc:
            r.encoding = 'windows-1251'
            
        response = r.text

        if 'beget=begetok' in response: # av
            _cookie = requests.cookies.create_cookie('beget','begetok',domain=urlparse.urlsplit(url).netloc,path='/')
            self.s.cookies.set_cookie(_cookie)
            r = self.s.get(url, headers=headers, timeout=20)
            response  = r.text

        if 'fromCharCode,sucuri_cloudproxy_js' in response: # sebn
            from sucuri import sucuri_decode
            sucuri_name, sucuri_value = sucuri_decode(response)
            sucuri_cookie = requests.cookies.create_cookie(sucuri_name,sucuri_value,domain=urlparse.urlsplit(url).netloc,path='/',
                                                           discard=False,expires=(time.time() + 86400))
            self.s.cookies.set_cookie(sucuri_cookie)
            r = self.s.get(url, headers=headers, timeout=20)
            response  = r.text
        
        if len(response) > 10:
            self.s.cookies.save(ignore_discard=True)

        self.s.close()
        return HTMLParser().unescape(response)
Example #5
            return default, [first]

        encoding = find_cookie(second)
        if encoding:
            return encoding, [first, second]

        return default, [first, second]


# For converting & <-> &amp; etc.
try:
    from html import escape
except ImportError:
    from cgi import escape
if sys.version_info[:2] < (3, 4):
    unescape = HTMLParser().unescape
else:
    from html import unescape

try:
    from collections import ChainMap
except ImportError:  # pragma: no cover
    from collections import MutableMapping

    try:
        from reprlib import recursive_repr as _recursive_repr
    except ImportError:

        def _recursive_repr(fillvalue='...'):
            '''
            Decorator to make a repr function return fillvalue for a recursive
Example #6
class RTBF(Plugin):
    GEO_URL = 'https://www.rtbf.be/api/geoloc'
    TOKEN_URL = 'https://token.rtbf.be/'
    RADIO_STREAM_URL = 'http://www.rtbfradioplayer.be/radio/liveradio/rtbf/radios/{}/config.json'

    _url_re = re.compile(r'https?://(?:www\.)?(?:rtbf\.be/auvio/.*\?l?id=(?P<video_id>[0-9]+)#?|rtbfradioplayer\.be/radio/liveradio/(?:webradio-)?(?P<radio>.+))')
    _stream_size_re = re.compile(r'https?://.+-(?P<size>\d+p?)\..+?$')

    _video_player_re = re.compile(r'<iframe\s+class="embed-responsive-item\s+js-embed-iframe".*src="(?P<player_url>.+?)".*?</iframe>', re.DOTALL)
    _video_stream_data_re = re.compile(r'<div\s+id="js-embed-player"\s+class="js-embed-player\s+embed-player"\s+data-media="(.+?)"')

    _geo_schema = validate.Schema(
        {
            'country': validate.text,
            'zone': validate.text
        }
    )

    _video_stream_schema = validate.Schema(
        validate.transform(_video_stream_data_re.search),
        validate.any(
            None,
            validate.all(
                validate.get(1),
                validate.transform(HTMLParser().unescape),
                validate.transform(parse_json),
                {
                    'geoLocRestriction': validate.text,
                    validate.optional('isLive'): bool,
                    validate.optional('startDate'): validate.text,
                    validate.optional('endDate'): validate.text,
                    'sources': validate.any(
                        [],
                        validate.Schema({
                            validate.text: validate.any(None, '', validate.url())
                        })
                    ),
                    validate.optional('urlHls'): validate.any(None, '', validate.url()),
                    validate.optional('urlDash'): validate.any(None, '', validate.url()),
                    validate.optional('streamUrlHls'): validate.any(None, '', validate.url()),
                    validate.optional('streamUrlDash'): validate.any(None, '', validate.url())
                }
            )
        )
    )

    _radio_stream_schema = validate.Schema(
        {
            'audioUrls': validate.all(
                [{
                    'url': validate.url(),
                    'mimeType': validate.text
                }]
            )
        }
    )

    @classmethod
    def check_geolocation(cls, geoloc_flag):
        if geoloc_flag == 'open':
            return True

        res = http.get(cls.GEO_URL)
        data = http.json(res, schema=cls._geo_schema)
        return data['country'] == geoloc_flag or data['zone'] == geoloc_flag

    @classmethod
    def tokenize_stream(cls, url):
        res = http.post(cls.TOKEN_URL, data={'streams[url]': url})
        data = http.json(res)
        return data['streams']['url']

    @staticmethod
    def iso8601_to_epoch(date):
        # Convert an ISO 8601-formatted string date to datetime
        return datetime.datetime.strptime(date[:-6], '%Y-%m-%dT%H:%M:%S') + \
            datetime.timedelta(hours=int(date[-6:-3]), minutes=int(date[-2:]))

    @classmethod
    def can_handle_url(cls, url):
        return RTBF._url_re.match(url)

    def _get_radio_streams(self, radio):
        res = http.get(self.RADIO_STREAM_URL.format(radio.replace('-', '_')))
        streams = http.json(res, schema=self._radio_stream_schema)

        for stream in streams['audioUrls']:
            match = self._stream_size_re.match(stream['url'])
            if match is not None:
                quality = '{}k'.format(match.group('size'))
            else:
                quality = stream['mimeType']  # key name as declared in _radio_stream_schema
            yield quality, HTTPStream(self.session, stream['url'])

    def _get_video_streams(self):
        res = http.get(self.url)
        match = self._video_player_re.search(res.text)
        if match is None:
            return
        player_url = match.group('player_url')
        stream_data = http.get(player_url, schema=self._video_stream_schema)
        if stream_data is None:
            return

        # Check geolocation to prevent further errors when stream is parsed
        if not self.check_geolocation(stream_data['geoLocRestriction']):
            self.logger.error('Stream is geo-restricted')
            return

        now = datetime.datetime.now()
        try:
            if isinstance(stream_data['sources'], dict):
                urls = []
                for profile, url in stream_data['sources'].items():
                    if not url or url in urls:
                        continue
                    match = self._stream_size_re.match(url)
                    if match is not None:
                        quality = match.group('size')
                    else:
                        quality = profile
                    yield quality, HTTPStream(self.session, url)
                    urls.append(url)

            hls_url = stream_data.get('urlHls') or stream_data.get('streamUrlHls')
            if hls_url:
                if stream_data.get('isLive', False):
                    # Live streams require a token
                    hls_url = self.tokenize_stream(hls_url)
                for stream in HLSStream.parse_variant_playlist(self.session, hls_url).items():
                    yield stream

        except IOError as err:
            if '403 Client Error' in str(err):
                # Check whether video is expired
                if 'startDate' in stream_data:
                    if now < self.iso8601_to_epoch(stream_data['startDate']):
                        self.logger.error('Stream is not yet available')
                elif 'endDate' in stream_data:
                    if now > self.iso8601_to_epoch(stream_data['endDate']):
                        self.logger.error('Stream has expired')

    def _get_streams(self):
        match = self.can_handle_url(self.url)
        if match.group('radio'):
            return self._get_radio_streams(match.group('radio'))
        return self._get_video_streams()
    def uncode_name(name):  # convert all the &# codes to char, remove extra-space and normalize
        from HTMLParser import HTMLParser

        name = name.replace('<![CDATA[', '').replace(']]', '')
        name = HTMLParser().unescape(name.lower())
        return name
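The helper above strips the CDATA wrapper, lowercases, and unescapes entities. A short usage sketch, assuming uncode_name is reachable at module scope; the input string is invented:

print uncode_name('<![CDATA[Tom &amp; Jerry]]')
# prints: tom & jerry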
Example #8

class JsonReader(object):
    def __init__(self, json_path):
        self.json_path = json_path

    def get_json_data(self):
        data = dict()
        with open(self.json_path, 'r') as json_file:
            data = json.loads(json_file.read())
        return data


if __name__ == "__main__":

    curr_path = os.path.dirname(os.path.realpath(__file__))

    #HTML TEMPLATE
    htmlFilePath = os.path.join(curr_path, 'sample.html')
    with open(htmlFilePath, 'r') as content_file:
        content = content_file.read()

    #JSON DATA
    json_reader = JsonReader(os.path.join(curr_path, 'sample.json'))
    dbData = json_reader.get_json_data()

    outputHTML = HTMLParser(content, dbData).generateHTML()

    htmlResultFilePath = os.path.join(curr_path, 'result.html')
    with open(htmlResultFilePath, 'w') as content_file:
        content_file.write(outputHTML)
Example #9
def scrap_commentary(match_link):
    link_split = match_link.split('/')
    match_id = link_split[len(link_split) - 1]
    page = urllib2.urlopen('http://www.goal.com' + match_link +
                           '/live-commentary')
    page_text = page.read()
    new_text = re.sub('<br>', '. ', page_text)
    soup = BeautifulSoup(new_text, "html.parser")
    i = 0
    global count
    print match_link
    global previousContent
    fullContent = ''
    ofile = open('matchData/' + match_id + '.csv', "wt")
    writer = csv.writer(ofile)
    ul = soup.find('ul', {'class': 'commentaries '})
    tags = ul.find_all('li')
    # print tags
    h = HTMLParser()
    if len(tags) > 0:
        for tag in tags:
            div = tag.find('div')
            event = tag["data-event-type"]
            minute = div.find('div', {'class': 'time'}).text
            divContent = div.find('div', {'class': 'text'})
            #To check if there is <br> tag
            brtags = div.find('div', {'class': 'text'}).findAll('br')
            if len(brtags) > 0:
                tagcontent = brtags[0].previousSibling
                print "Previous BR tag", tagcontent
                for brtag in brtags:
                    brContent = brtag.text
                    print brContent
                    tagcontent = tagcontent + "; " + brContent
            else:
                tagcontent = div.find('div', {'class': 'text'}).text
            if event == 'substitution':
                spans = div.find('div', {'class': 'text'}).findAll('span')
                tagcontent = tagcontent.replace('Substitution',
                                                'Substitution ')
                tagcontent = tagcontent.replace('\n', ' ')
                for span in spans:
                    tagcontent = tagcontent.replace(
                        span.text, ''.join(span['class']).encode('utf-8') +
                        ' ' + span.text)

            tagcontent = tagcontent.rstrip('\n')
            tagcontent = tagcontent.strip()
            tagcontent = h.unescape(tagcontent)
            minute = minute.rstrip("\n\r")[1:-1]
            if (bool(re.search(r'\d', minute))):
                print minute
                writer.writerow(
                    [minute + "'",
                     event.encode('utf-8'), tagcontent])
            else:
                writer.writerow(["", event.encode('utf-8'), tagcontent])
            fullContent = fullContent + tagcontent
            if (tagcontent):
                print(tagcontent)
        # db.goal_commentary.update({'match_id':match_id}, {"$set" : {'commentary' : fullContent}}, upsert=True)
        print('inserted')
    ofile.close()
    try:
        lineUpPage = urllib2.urlopen('http://www.goal.com' + match_link +
                                     '/lineups')
        soup = BeautifulSoup(lineUpPage, "html.parser")
        lineupfile = open('matchData/lineups/' + match_id + 'lineup.csv', "wt")
        writerLineup = csv.writer(lineupfile)
        div = soup.find('div', {'class': 'main-content lineups'})
        homeTeam = div.find('h2', {'class': 'home'}).text
        awayTeam = div.find('h2', {'class': 'away'}).text
        writerLineup.writerow([homeTeam, awayTeam])
        writerLineup.writerow(["Starting XI"])
        start11Div = div.find('div', {'class': 'players'})
        homeLis = start11Div.find('div', {
            'class': 'home'
        }).find('ul').find_all('li', {'data-side': 'home'})
        awayLis = start11Div.find('div', {
            'class': 'away'
        }).find('ul').find_all('li', {'data-side': 'away'})
        homecolumn = []
        awaycolumn = []
        for homeli in homeLis:
            playerNumber = homeli['data-number']
            playerName = homeli.find('a').find('span', {'class': 'name'}).text
            eventLis = homeli.find('a').find('ul', {
                'class': 'events'
            }).find_all('li')
            events = []
            if len(eventLis) > 0:
                for eventLi in eventLis:
                    eventAction = eventLi['class']
                    eventTime = str(eventLi.text)
                    print eventTime
                    events.append((eventAction[0].encode('utf-8'), eventTime))
            tempLst = [playerNumber, playerName, events]
            homecolumn.append(tempLst)
            # writerLineup.writerow([playerNumber,playerName,str(events)[1:-1]])

        for awayLi in awayLis:
            playerNumber = awayLi['data-number']
            playerName = awayLi.find('a').find('span', {'class': 'name'}).text
            eventLis = awayLi.find('a').find('ul', {
                'class': 'events'
            }).find_all('li')
            events = []
            if len(eventLis) > 0:
                for eventLi in eventLis:
                    eventAction = eventLi['class']
                    eventTime = str(eventLi.text)
                    events.append((eventAction[0].encode('utf-8'), eventTime))
            tempLst = [playerNumber, playerName, events]
            awaycolumn.append(tempLst)
            # writerLineup.writerow([playerNumber,playerName,str(events)[1:-1]])
        for i in range(len(awaycolumn)):
            print homecolumn[i], awaycolumn[i]
            writerLineup.writerow(homecolumn[i] + awaycolumn[i])

        # time.sleep(120)
        lineupfile.close()
        print "Line up inserted"
    except:
        e = sys.exc_info()[0]
        print e
    sys.exit()
    return
Example #10
def process(keyword, page, website):  # later this needs to be split by type

    siteconfs = os.listdir(
        os.path.dirname(os.path.abspath(__file__)) + '/siteconfs')
    if not website in siteconfs:
        print 'siteconf not found'
        return []
    confpath = os.path.dirname(
        os.path.abspath(__file__)) + '/siteconfs/' + website
    siteconf = ConfigParser.ConfigParser()
    siteconf.read(confpath)
    extractors = siteconf.sections()
    try:
        extractors = sorted(extractors, key=lambda d: int(d[-1]))
    except:
        pass
    urlBac = ''
    for extractor in extractors:
        url = siteconf.get(extractor, 'searchUrl')
        url = url.replace('${keyword}', keyword).replace('${page}', str(page))
        print url
        segmentCut = siteconf.get(extractor, 'segment')
        titleCut = siteconf.get(extractor, 'title')
        urlCut = siteconf.get(extractor, 'url')
        infoCuts = siteconf.get(extractor, 'info')
        urlinfos = []
        if urlBac == url:  # if the link is the same, don't open it again
            pageBuf = ct.crawlerTool.getPage(
                url
            )  # print HTMLParser().unescape('&#183;').encode('unicode-escape').decode('string_escape') comes out as mojibake
        else:
            urlBac = url
            pageBuf = ct.crawlerTool.getPage(url)
        baseurl = '/'.join(url.split('/')[:3])
        pageBuf = urlHostParser.make_links_absolute(pageBuf, baseurl)
        segments = ct.crawlerTool.getXpath(segmentCut, pageBuf)
        if not segments:
            print 'no matched segments', website
            continue
        for segment in segments:
            try:
                urlinfo = {}
                urlinfo['url'] = ct.crawlerTool.getXpath(urlCut, segment)[0]

                title = HTMLParser().unescape(
                    ct.crawlerTool.extractorText(
                        ct.crawlerTool.getXpath(
                            titleCut, segment)[0]))  # output seems garbled later unless converted to str
                #print title,HTMLParser().unescape(title)
                #print ct.crawlerTool.getXpath('//h2/a[1]', segment)  # after decoding, &#183; seems to turn into mojibake
                urlinfo['title'] = title
                #print title
                urlinfo['info'] = ''
                for infoCut in infoCuts.split(';'):
                    urlinfo['info'] += ' '.join(
                        ct.crawlerTool.getXpath(infoCut, segment))  # info fields are concatenated
                #print urlinfo['url'], urlinfo['title'], urlinfo['info']
                urlinfos.append(urlinfo)
            except Exception, e:
                traceback.print_exc()

        return {"urlinfos": urlinfos}
Example #11
if __name__ == '__main__':
    test()
    print(myadd2(2, 3))
    import my_token
    import config
    import net_logic.master
    my_token.token.Init(
        r'C:\workspace\code\chromium24\src\build\Release\ctp_data\token')
    config.URL = 'https://apkins.yfbro.com'
    net_logic.master.Master.PullJsonFile()

    from HTMLParser import HTMLParser
    xx = unichr(20013)
    aa = u"&#xE426;&#xF78F;&#xE891;&#xF78F;&#xF2F8;&#xF78F;&#xEBED;&#xF2F8;&#xF2F8;"
    a = "&#xE426;&#xF78F;&#xE891;&#xF78F;&#xF2F8;&#xF78F;&#xEBED;&#xF2F8;&#xF2F8;"
    b = HTMLParser().unescape(a)
    bb = HTMLParser().unescape(aa)
    testa()
    live = adbtool.find_server.FindAllServer()
    adbtool.find_server.KillAllServer(live)
    live = adbtool.find_server.FindAllServer()
    adbtool.base.AdbCommandBase.adb = r'C:\workspace\code\chromium24\src\build\out\adb_1.0.39\adb'

    port = adbtool.start_server.Command.GenPort()
    # spawn a new server
    start_server = adbtool.start_server.Command(port)
    start_server.Execute()

    def CallbackSucc(progress):
        pass
Example #12
    def parseHtml(self):
        h = HTMLParser()

        for obj in self:
            return h.unescape(obj.Observation)
Example #13
def entities_to_unicode(value):
    from HTMLParser import HTMLParser
    parser = HTMLParser()
    return parser.unescape(value)
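A quick usage sketch of the wrapper above; the input is illustrative. Both numeric character references and named entities are resolved:

print entities_to_unicode('It&#39;s 5 &gt; 3 &amp; 2 &lt; 4')
# prints: It's 5 > 3 & 2 < 4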
Example #14
def getImageTitle(html):
    imageTitle = find_between(html, "data-title=", "data-tags=")
    h = HTMLParser()
    imageTitle = h.unescape(imageTitle)
    #print(h.unescape(imageTitle))
    return imageTitle.replace('"', '').strip()
Example #15
def transHTML(text):
    h = HTMLParser()
    return h.unescape(text)
Example #16
def _sanitize(string, html=False):
    string = convert_to_unicode(string)
    if not html:
        string = HTMLParser().unescape(strip_tags(string))
    return WHITESPACE_RE.sub(' ', string).strip()
# -*- coding: utf-8 -*-
# Order Favourites program add-on for Kodi 17.6+.
# Lets you see and reorder your Kodi favourites, to organize them.
# In other words, this is an add-on to visually edit your
# favourites.xml file.
#
# doko-desuka 2020
# ====================================================================
import re
import sys
import json
import traceback
try:
    # Python 2.x
    from HTMLParser import HTMLParser
    PARSER = HTMLParser()
except ImportError:
    # Python 3.4+ (see https://stackoverflow.com/a/2360639)
    import html
    PARSER = html

import xbmc, xbmcgui, xbmcplugin, xbmcvfs
from xbmcaddon import Addon

FAVOURITES_PATH = 'special://userdata/favourites.xml'
THUMBNAILS_PATH_FORMAT = 'special://thumbnails/{folder}/{file}'

PROPERTY_FAVOURITES_RESULT = 'ordfav.result'

ADDON = Addon()
PLUGIN_ID = int(sys.argv[1])
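With the shim above, PARSER.unescape() resolves to HTMLParser().unescape on Python 2 and to html.unescape on Python 3.4+, so the rest of the add-on can call it uniformly. A small usage sketch assuming the PARSER object defined above; the favourite label is made up:

label = PARSER.unescape('&quot;Movies&quot; &amp; &quot;TV Shows&quot;')
# -> '"Movies" & "TV Shows"' on both Python 2 and Python 3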
Example #18
    def getSource(self, url, form_data, referer, xml=False, mobile=False):
        url = self.fixurl(url)

        if not referer:
            referer = url
        else:
            referer = self.fixurl(
                referer.replace('wizhdsports.be', 'wizhdsports.to').replace(
                    'ibrod.tv', 'www.ibrod.tv'))

        headers = {'Referer': referer}
        if mobile:
            self.s.headers.update({
                'User-Agent':
                'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13E238 Safari/601.1'
            })

        if xml:
            headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'dinozap.info' in urlparse.urlsplit(url).netloc:
            headers['X-Forwarded-For'] = '178.162.222.111'
        if 'playerhd2.pw' in urlparse.urlsplit(url).netloc:
            headers['X-Forwarded-For'] = '178.162.222.121'
        if 'playerapp1.pw' in urlparse.urlsplit(url).netloc:
            headers['X-Forwarded-For'] = '178.162.222.122'

        if 'cndhlsstream.pw' in urlparse.urlsplit(url).netloc:
            del self.s.headers['Accept-Encoding']
        if 'skstream.tv' in urlparse.urlsplit(url).netloc:
            del self.s.headers['Accept-Encoding']

        if form_data:
            #zo**tv
            if 'uagent' in form_data[0]:
                form_data[0] = ('uagent', self.s.headers['User-Agent'])

            r = self.s.post(url, headers=headers, data=form_data, timeout=20)
        else:
            try:
                r = self.s.get(url, headers=headers, timeout=20)
            except (requests.exceptions.MissingSchema):
                return 'pass'

        #many utf8 encodings are specified in HTTP body not headers and requests only checks headers, maybe use html5lib
        #https://github.com/kennethreitz/requests/issues/2086
        if 'streamlive.to' in urlparse.urlsplit(url).netloc \
        or 'sport365.live' in urlparse.urlsplit(url).netloc \
        or 'vipleague' in urlparse.urlsplit(url).netloc \
        or 'cinestrenostv.tv' in urlparse.urlsplit(url).netloc \
        or 'batmanstream.com' in urlparse.urlsplit(url).netloc \
        or 'sportcategory.com' in urlparse.urlsplit(url).netloc:
            r.encoding = 'utf-8'
        if 'lfootball.ws' in urlparse.urlsplit(url).netloc:
            r.encoding = 'windows-1251'

        response = r.text

        while ('answer this question' in response
               and 'streamlive.to' in urlparse.urlsplit(url).netloc):
            import xbmcgui
            dialog = xbmcgui.Dialog()
            r = re.compile("Question:\s*([^<]+)<")
            q_regex = r.findall(response)
            if q_regex:
                q_resp = dialog.input(q_regex[0])
                if q_resp:
                    form_data = 'captcha={0}'.format(q_resp)
                    headers['Referer'] = url
                    headers[
                        'Content-Type'] = 'application/x-www-form-urlencoded'
                    headers['Content-Length'] = str(len(form_data))
                    r = self.s.post(url,
                                    headers=headers,
                                    data=form_data,
                                    timeout=20)
                    response = r.text
                else:
                    break
            else:
                break

        if len(response) > 10:
            if self.cookie_file:
                self.save_cookies_lwp(self.s.cookies, self.cookie_file)

        if 'setCurrentQuality' in response:
            response = response.replace("""' + '""", '')

        return HTMLParser().unescape(response)
Example #19
    def print_info(self, req, req_body, res, res_body):
        def parse_qsl(s):
            return '\n'.join(
                "%-20s %s" % (k, v)
                for k, v in urlparse.parse_qsl(s, keep_blank_values=True))

        req_header_text = "%s %s %s\n%s" % (req.command, req.path,
                                            req.request_version, req.headers)
        res_header_text = "%s %d %s\n%s" % (res.response_version, res.status,
                                            res.reason, res.headers)

        print with_color(33, req_header_text)

        u = urlparse.urlsplit(req.path)
        if u.query:
            query_text = parse_qsl(u.query)
            print with_color(32,
                             "==== QUERY PARAMETERS ====\n%s\n" % query_text)

        cookie = req.headers.get('Cookie', '')
        if cookie:
            cookie = parse_qsl(re.sub(r';\s*', '&', cookie))
            print with_color(32, "==== COOKIE ====\n%s\n" % cookie)

        auth = req.headers.get('Authorization', '')
        if auth.lower().startswith('basic'):
            token = auth.split()[1].decode('base64')
            print with_color(31, "==== BASIC AUTH ====\n%s\n" % token)

        if req_body is not None:
            req_body_text = None
            content_type = req.headers.get('Content-Type', '')

            if content_type.startswith('application/x-www-form-urlencoded'):
                req_body_text = parse_qsl(req_body)
            elif content_type.startswith('application/json'):
                try:
                    json_obj = json.loads(req_body)
                    json_str = json.dumps(json_obj, indent=2)
                    if json_str.count('\n') < 50:
                        req_body_text = json_str
                    else:
                        lines = json_str.splitlines()
                        req_body_text = "%s\n(%d lines)" % ('\n'.join(
                            lines[:50]), len(lines))
                except ValueError:
                    req_body_text = req_body
            elif len(req_body) < 1024:
                req_body_text = req_body

            if req_body_text:
                print with_color(
                    32, "==== REQUEST BODY ====\n%s\n" % req_body_text)

        print with_color(36, res_header_text)

        cookies = res.headers.getheaders('Set-Cookie')
        if cookies:
            cookies = '\n'.join(cookies)
            print with_color(31, "==== SET-COOKIE ====\n%s\n" % cookies)

        if res_body is not None:
            res_body_text = None
            content_type = res.headers.get('Content-Type', '')

            if content_type.startswith('application/json'):
                try:
                    json_obj = json.loads(res_body)
                    json_str = json.dumps(json_obj, indent=2)
                    if json_str.count('\n') < 50:
                        res_body_text = json_str
                    else:
                        lines = json_str.splitlines()
                        res_body_text = "%s\n(%d lines)" % ('\n'.join(
                            lines[:50]), len(lines))
                except ValueError:
                    res_body_text = res_body
            elif content_type.startswith('text/html'):
                m = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>', res_body,
                              re.I)
                if m:
                    h = HTMLParser()
                    print with_color(
                        32, "==== HTML TITLE ====\n%s\n" %
                        h.unescape(m.group(1).decode('utf-8')))
            elif content_type.startswith('text/') and len(res_body) < 1024:
                res_body_text = res_body

            if res_body_text:
                print with_color(
                    32, "==== RESPONSE BODY ====\n%s\n" % res_body_text)
Example #20
from models import allowed_attributes_map as pages_allowed_attributes_map
from models import allowed_styles_map as pages_allowed_styles_map
from models import slugify
from exceptions import IFrameSrcNotApproved


def sanitize_intermediate(html):
    """
    Sanitizes template tags and escapes entities.
    """
    return html.replace('{', '&#123;')\
               .replace('}', '&#125;')\
               .replace('&', '{amp}')  # escape all entities


_unescape_util = HTMLParser()


def desanitize(fragment):
    """
    Undo sanitization, when we need the original contents.
    """
    fragment = sanitize_final(fragment)
    return _unescape_util.unescape(fragment)


def sanitize_final(html):
    """
    Fixes escaped entities.
    """
    return html.replace('{amp}', '&')  # unescape entities
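A round-trip usage sketch of the helpers above, with an invented fragment: sanitize_intermediate() neutralises braces and ampersands so later processing leaves them alone, and desanitize() brings back the readable text (entities in the original fragment come back unescaped):

fragment = '<b>{{ title }}</b> Tom &amp; Jerry'
protected = sanitize_intermediate(fragment)
# every '{', '}' and '&' is now encoded, e.g. '{' becomes '{amp}#123;'
restored = desanitize(protected)
# -> '<b>{{ title }}</b> Tom & Jerry'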
Example #21
wget_es = {
    0: "No problems occurred.",
    2: "User interference.",
    1 << 8: "Generic error code.",
    2 << 8: "Parse error - for instance, when parsing command-line "
            "options, .wgetrc or .netrc...",
    3 << 8: "File I/O error.",
    4 << 8: "Network failure.",
    5 << 8: "SSL verification failure.",
    6 << 8: "Username/password authentication failure.",
    7 << 8: "Protocol errors.",
    8 << 8: "Server issued an error response."
}
############################################################

parser = HTMLParser()
s = '\x1b[%d;%dm%s\x1b[0m'  # terminal color template

cookie_file = os.path.join(os.path.expanduser('~'), '.Xiami.cookies')

headers = {
    "Accept":"text/html,application/xhtml+xml,application/xml; " \
        "q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding":"text/html",
    "Accept-Language":"en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2",
    "Content-Type":"application/x-www-form-urlencoded",
    "Referer":"http://www.xiami.com/",
    "User-Agent":"Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 "\
        "(KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"
}
Example #22
def delegate(element, do_spellcheck=False, **kwargs):
    """
    Takes html element in form of etree and converts it into string.
    """
    '''>>> from lxml import etree
       >>> root = etree.HTML('<h1>Title</h1>')
       >>> print delegate(root[0][0])
       \chapter{Title}'''
    # delegate the work to classes handling special cases

    # Filter out empty tags
    try:
        element.tag
    except AttributeError:
        pass

    css_classes = element.attrib.get('class', '').lower()

    if element.tag == 'div':
        my_element = HTMLElement(element, do_spellcheck, **kwargs)

    elif element.tag == 'table':
        USE_IMAGE_FOR_TABLE = kwargs.get('USE_IMAGE_FOR_TABLE', False)
        table_inner_html = u''.join([etree.tostring(e) for e in element])

        if not USE_IMAGE_FOR_TABLE:
            my_element = Table(element, do_spellcheck, **kwargs)
        else:
            items = (
                    ("&#13;", ""),
                    ("&uuml;", "&#10003;"),
                    ("&#252;", "&#10003;"),
                    ("\\checkmark", "&#10003;"),
                    (u"ü", "&#10003;"),

            )
            for oldvalue, newvalue in items:
                table_inner_html = table_inner_html.replace(oldvalue, newvalue)

            image_file = get_image_for_html_table(
                table_inner_html, do_spellcheck=do_spellcheck)

            new_html = u"<img src='{0}'/>".format(image_file)
            new_element = etree.HTML(new_html).find(".//img")
            my_element = IMG(new_element, is_table=True, **kwargs)
            my_element.content["is_table"] = True
    elif element.tag == 'tr':
        my_element = TR(element, do_spellcheck, **kwargs)
    elif element.tag == 'td':
        my_element = TD(element, do_spellcheck, **kwargs)
    elif element.tag == 'img':
        try:
            my_element = IMG(element, do_spellcheck, **kwargs)
        except IOError:
            return ''
    elif element.tag == 'a':
        my_element = A(element, do_spellcheck, **kwargs)
    elif element.tag == 'span' and 'math-tex' in css_classes:
        equation = element.text or ''
        tail = element.tail or ''

        equation = equation.strip()
        equation = " ".join(re.split(r"\r|\n", equation))
        equation = re.sub(r'^\\\s*\(', "", equation, flags=re.MULTILINE)
        equation = re.sub(r'\\\s*\)$', "", equation, flags=re.MULTILINE)
        equation = re.sub(
            r"\{\{\{\{\{([\w,\.^]+)\}\}\}\}\}", r"{\1}", equation)
        equation = re.sub(r"\{\{\{\{([\w,\.^]+)\}\}\}\}", r"{\1}", equation)
        equation = re.sub(r"\{\{\{([\w,\.^]+)\}\}\}", r"{\1}", equation)
        equation = re.sub(r"\{\{([\w,\.^]+)\}\}", r"{\1}", equation)

        from HTMLParser import HTMLParser
        html_parser = HTMLParser()
        equation = html_parser.unescape(equation)

        equation = equation.replace("&", "\&")
        equation = equation.replace("<", "\\textless")
        equation = equation.replace(">", "\\textgreater")
        equation = equation.replace("\;", "\,")

        equation = equation.strip()

        if "\\\\" in equation and not equation.startswith("\\begin{gathered}"):
            equation = "\\begin{gathered}" + equation + "\\end{gathered}"

        equation = "\\begin{math}" + equation + "\\end{math}"
        _latex_code = equation + ' ' + tail
        return _latex_code

    elif isinstance(element, etree._Comment):
        my_element = None  # skip XML comments
    else:
        # no special handling required
        my_element = HTMLElement(element, do_spellcheck, **kwargs)

    try:
        my_element
    except NameError:
        return ''

    if my_element is None:
        return ''
    else:
        return my_element.render()
Example #23
def html_unescape(html):
    return HTMLParser().unescape(html)
import ftfy
import urllib2
import json
import datetime
import time
import pytz
# import pandas as pd
import os
import re
# from pandas import DataFrame

from HTMLParser import HTMLParser  # python 2.x

filename = 'hacker_news_comments.txt'

html_parser = HTMLParser()

html_tags = re.compile(r'<.*?>')
square_brackets = re.compile(r'\[.*?\]')

remove_html_tags = lambda x: re.sub(html_tags, " ", x)
remove_square_brackets = lambda x: re.sub(square_brackets, " ", x)

if os.path.isfile(filename):
    os.remove(filename)

ts = int(time.time())
# df = DataFrame()
hitsPerPage = 1000
requested_keys = [
    "author", "comment_text", "created_at_i", "objectID", "points"
Example #25
# -*- coding: utf-8 -*-
import requests
from apiclient.discovery import build

from xml.etree import ElementTree
import re
from HTMLParser import HTMLParser
from mongo_db import get_mongo_db

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

mongo_db = get_mongo_db()

htmlParser = HTMLParser()

YOUTUBE_DEVELOPER_KEY = "AIzaSyACCj3LAKVJva3wG8QOczho-spqORzyK_E"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"


# tokenize video's subtitle
def clean_text(text):
    if text is None:
        return []

    tokens = word_tokenize(text)

    # convert to lower case
    tokens = [w.lower() for w in tokens]
def unescape(s):
    return HTMLParser().unescape(s)  # ew
Example #27
def html_unescape(s):
    parser = HTMLParser()
    return parser.unescape(s)
Example #28
def title_cleaning(string):
    h=HTMLParser()
    string = h.unescape(string)
    string = re.sub(r"[^A-Za-z0-9]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
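A short usage sketch of title_cleaning above, assuming the imports the snippet relies on (re and HTMLParser) are in place; the input title is invented. Entities are unescaped, punctuation is replaced by spaces, and the result is collapsed and lowercased:

print title_cleaning('Tom &amp; Jerry!! (1940) [HD]')
# prints: tom jerry 1940 hd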
Example #29
#!/usr/bin/env python
from piazza_api import Piazza
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser

import sys
import warnings

h = HTMLParser()
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# Mapping from user-provided Piazza class name to Piazza network ID (https://piazza.com/class/{network_id})
CLASS_NETWORK_IDS = {'cpsc340': 'j2grn4bal3z44'}


def unicode2str(string_in):
    """Use BeautifulSoup to parse raw text from HTML format"""
    soup = BeautifulSoup(string_in.encode('ascii', 'ignore'), 'lxml')
    text = ' '.join([s for s in soup.stripped_strings
                     ]).replace('\n', ' ').replace('\r', ' ')
    return h.unescape(text)


def main(piazza_class, output_filename):
    # Use Piazza API (https://github.com/hfaran/piazza-api) to login and fetch all posts for given class
    p = Piazza()
    p.user_login()
    piazza_class = p.network(CLASS_NETWORK_IDS[piazza_class])
    posts = piazza_class.iter_all_posts()

    f = open(output_filename, 'w')
Example #30
 def unescape(cls, *args, **kwargs):
     return HTMLParser().unescape(*args, **kwargs)
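Most of the snippets above are Python 2 code, where HTMLParser().unescape was the usual way to decode entities. The method was deprecated in Python 3.4 and removed in Python 3.9; html.unescape() is the replacement. A minimal portable helper along the lines of the compatibility shims seen earlier:

try:
    from html import unescape            # Python 3.4+
except ImportError:                       # Python 2
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape

print(unescape('&lt;b&gt;5 &gt; 3&lt;/b&gt;'))
# prints: <b>5 > 3</b>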