Example #1
def listVideos(url):
    content = getUrl(url)
    contenttop = content[content.find('<div class="topStoryWrapper clear">'):]
    contenttop = contenttop[:contenttop.find('<div class="subcategoryList clear">')]
    titletop = contenttop[contenttop.find('<h2 class="topStoryTitle">'):]
    match = re.compile('<a href="(.+?)" title="(.+?)"', re.DOTALL).findall(titletop)
    url="http://"+language2+".euronews.com"+match[0][0]
    title=match[0][1]
    title=HTMLParser().unescape(title.decode('utf-8'))
    title=title.encode('utf-8')
    match = re.compile('src="(.+?)"', re.DOTALL).findall(titletop)
    thumb =  match[0]
    match = re.compile('<p>(.+?)</p>', re.DOTALL).findall(titletop)
    desc=match[0]
    match = re.compile('([0-9]+/[0-9]+ [0-9]+:[0-9]+) CET', re.DOTALL).findall(contenttop)
    datum= match[0]
    debug("TITLE: " + title)
    debug("URL: " + url)
    addLink(datum +" - "+title, url, 'playVideo', thumb, desc)
    spl = content.split('<li class="clearAfter fixedHeight">')
    
    for i in range(1, len(spl), 1):   
        element=spl[i]
        match = re.compile('([0-9]+/[0-9]+ [0-9]+:[0-9]+) CET', re.DOTALL).findall(element)
        datum= match[0]
        debug("++++++++ "+ datum)
        sp2 = element.split('<a title="INSIDERS"')
        for i2 in range(0, len(sp2), 1):
            element=sp2[i2]
            debug("---------")
            debug(element)
            debug("---------")
            match = re.compile('href="([^"]+?)"[ ]+title="([^"]+?)"', re.DOTALL).findall(element)
            if not match:
                debug("No URL")
                continue
            url="http://"+language2+".euronews.com"+match[0][0]
            title=match[0][1]
            match = re.compile('src="(.+?)"', re.DOTALL).findall(element)
            if match:
                thumb = match[0]
            else:
                thumb = ""
            match = re.compile('<p>(.+?)</p>', re.DOTALL).findall(element)
            if match:
                desc = match[0]
            else:
                desc = ""
            debug("URL :" + url)
            title=HTMLParser().unescape(title.decode('utf-8'))
            title=title.encode('utf-8')
            addLink( datum +" - "+title, url, 'playVideo', thumb, desc)
    xbmcplugin.endOfDirectory(pluginhandle)
    if forceViewMode == "true":
        xbmc.executebuiltin('Container.SetViewMode('+viewMode+')')
 def Items(self, url, amount='all'):
     #pull a set amount of podcasts, or all if no amount is given, from the RSS feed (podcast name, image, description, publish date and play link)
     ParseRSS = feedparser.parse(url)
     feed = ParseRSS.feed
     entries = ParseRSS.entries
     if str(amount).isdigit():
         amount = int(amount)
         for i in range(amount):
             try:
                 image = self._ImageResolve(entries[i].image)
             except:
                 image = self._ImageResolve(feed.image)
             playlink = self._PlayLinkResolve(entries[i].links)
             date = self._DateResolve(entries[i].published)
             try:
                 description = entries[i].description
                 description = HTMLParser().unescape(description)
                 description = self._DescriptionClean(description)
             except:
                 description = ''
             self.ITEM.append({
                 'title': entries[i].title.encode('utf8'),
                 'image': image,
                 'description': description.encode('utf8'),
                 'date': date,
                 'playlink': playlink
             })
     elif amount == 'all':
         for entry in entries:
             try:
                 image = self._ImageResolve(entry.image)
             except:
                 image = self._ImageResolve(feed.image)
             playlink = self._PlayLinkResolve(entry.links)
             date = self._DateResolve(entry.published)
             try:
                 description = entry.description
                 description = HTMLParser().unescape(description)
                 description = self._DescriptionClean(description)
             except:
                 description = ''
             self.ITEM.append({
                 'title': entry.title.encode('utf8'),
                 'image': image,
                 'description': description.encode('utf8'),
                 'date': date,
                 'playlink': playlink
             })
     else:
         koding.dolog(
             'Podcast RSS feed: invalid value for amount (must be an int or "all"). Value entered = '
             + str(amount) + '. URL of RSS = ' + str(url),
             line_info=True)
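
The Items() method above leans on feedparser's standard result object: entries[i].title, entries[i].links, entries[i].published and entries[i].description, with feed.image as a fallback thumbnail. A minimal standalone sketch of the same lookups, assuming a hypothetical feed URL:

import feedparser

d = feedparser.parse('http://example.com/podcast/feed.xml')  # hypothetical URL
for entry in d.entries[:5]:
    published = entry.get('published', '')
    description = entry.get('description', '')
    # the audio enclosure sits in entry.links next to the regular web link
    audio = [l.href for l in entry.links if l.get('rel') == 'enclosure']
    print(entry.title + ' | ' + published + ' | ' + (audio[0] if audio else ''))
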
def _cleanTitle(title, html=True):
    if html:
        title = HTMLParser().unescape(title)
        if sys.version_info[0] < 3:  # for Python 2
            if isinstance(title, unicode):
                title = title.encode('utf-8')
        return title
    else:
        title = title.replace("&lt;", "<").replace("&gt;", ">").replace(
            "&amp;",
            "&").replace("&#034;", "\"").replace("&#039;", "'").replace(
                "&quot;", "\"").replace("&szlig;",
                                        "ß").replace("&ndash;", "-")
        title = title.replace("&Auml;", "Ä").replace("&Uuml;", "Ü").replace(
            "&Ouml;",
            "Ö").replace("&auml;", "ä").replace("&uuml;", "ü").replace(
                "&ouml;", "ö").replace("&eacute;",
                                       "é").replace("&egrave;", "è")
        title = title.replace("&#x00c4;", "Ä").replace(
            "&#x00e4;",
            "ä").replace("&#x00d6;", "Ö").replace("&#x00f6;", "ö").replace(
                "&#x00dc;", "Ü").replace("&#x00fc;",
                                         "ü").replace("&#x00df;", "ß")
        title = title.replace("&apos;", "'").strip()
        return title
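
The helpers above, like most snippets in this listing, rely on HTMLParser().unescape(), which exists on Python 2 but was deprecated in Python 3.4 and removed in Python 3.9 in favour of html.unescape(). A small version-agnostic sketch of the same entity decoding, mirroring the decode/unescape/encode pattern used in these snippets:

import sys

def unescape_entities(text):
    """Decode HTML entities on Python 2 and Python 3 alike."""
    if sys.version_info[0] >= 3:
        from html import unescape
        return unescape(text)
    from HTMLParser import HTMLParser
    if isinstance(text, str):  # a byte string under Python 2
        text = text.decode('utf-8')
    return HTMLParser().unescape(text).encode('utf-8')

print(unescape_entities('Gr&ouml;&szlig;e &amp; N&auml;he'))
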
Example #4
    def get_data(self):
        try:
            parser = etree.XMLParser(recover=True, remove_blank_text=True)
            root = etree.parse(self.file_name, parser)
        except Exception as e:
            raise e

        comprobante = root.getroot()

        fecha = comprobante.get('fecha')
        if fecha: fecha = fecha.encode("utf-8")
        serie = comprobante.get('serie')
        if serie: serie = serie.encode("utf-8")
        folio = comprobante.get('folio')
        if folio: folio = folio.encode("utf-8")
        metodo_pago = comprobante.get('metodoDePago')
        if metodo_pago: metodo_pago = metodo_pago.encode("utf-8")
        num_cta = comprobante.get('NumCtaPago')
        if num_cta: num_cta = num_cta.encode("utf-8")
        emisor = comprobante.find('{http://www.sat.gob.mx/cfd/3}Emisor')
        razon_social = emisor.get('nombre')
        if razon_social: razon_social = razon_social.encode("utf-8")
        rfc = emisor.get('rfc')
        if rfc: rfc = rfc.encode("utf-8")
        subtotal = comprobante.get('subTotal')
        if subtotal: subtotal = subtotal.encode("utf-8")
        total = comprobante.get('total')
        if total: total = total.encode("utf-8")

        impuestos = comprobante.find('{http://www.sat.gob.mx/cfd/3}Impuestos')
        impuestos_traslados = impuestos.find(
            '{http://www.sat.gob.mx/cfd/3}Traslados')
        traslados = []
        iva = 0
        ieps = 0

        if impuestos_traslados is not None:
            for traslado in impuestos_traslados:
                if traslado.get('impuesto') == 'IVA' and traslado.get(
                        'tasa') in IVA:
                    iva = traslado.get('importe')
                if traslado.get('impuesto') == 'IEPS':
                    ieps = traslado.get('importe')

        comprobante_conceptos = comprobante.find(
            '{http://www.sat.gob.mx/cfd/3}Conceptos')
        data = []
        for concepto in comprobante_conceptos:
            descripcion = HTMLParser().unescape(concepto.get('descripcion'))
            if descripcion: descripcion = descripcion.encode("utf-8")
            importe = concepto.get('importe')
            if importe: importe = importe.encode("utf-8")
            concepto_object = [
                fecha, serie, folio, metodo_pago, num_cta, rfc, razon_social,
                descripcion, importe, iva, ieps, total
            ]
            data.append(concepto_object)

        return data
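
get_data() walks the CFDI 3.x invoice with lxml, so every child lookup needs the namespace in Clark notation ({http://www.sat.gob.mx/cfd/3}Tag). A self-contained sketch of that lookup pattern on a small hypothetical invoice fragment:

from lxml import etree

CFDI = '{http://www.sat.gob.mx/cfd/3}'
xml = ('<cfdi:Comprobante xmlns:cfdi="http://www.sat.gob.mx/cfd/3" '
       'fecha="2017-01-01T00:00:00" total="116.00">'
       '<cfdi:Emisor rfc="XAXX010101000" nombre="Ejemplo SA"/>'
       '<cfdi:Conceptos>'
       '<cfdi:Concepto descripcion="Servicio &amp; soporte" importe="100.00"/>'
       '</cfdi:Conceptos>'
       '</cfdi:Comprobante>')

comprobante = etree.fromstring(xml)
emisor = comprobante.find(CFDI + 'Emisor')
print(comprobante.get('total') + ' ' + emisor.get('rfc'))
for concepto in comprobante.find(CFDI + 'Conceptos'):
    print(concepto.get('descripcion') + ' = ' + concepto.get('importe'))
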
Example #5
 def run(self):
     self.progressbar_show.emit(True)
     self.info_label.emit(translate("AddonsInstaller", "Retrieving description..."))
     if len(self.macros[self.idx]) > 2:
         desc = self.macros[self.idx][2]
         url = self.macros[self.idx][4]
     else:
         mac = self.macros[self.idx][0].replace(" ","_")
         mac = mac.replace("&","%26")
         mac = mac.replace("+","%2B")
         url = "https://www.freecadweb.org/wiki/Macro_"+mac
         self.info_label.emit("Retrieving info from " + str(url))
         if ctx:
             u = urllib2.urlopen(url,context=ctx)
         else:
             u = urllib2.urlopen(url)
         p = u.read()
         if sys.version_info.major >= 3 and isinstance(p, bytes):
             p = p.decode("utf-8")
         u.close()
         code = re.findall("<pre>(.*?)<\/pre>",p.replace("\n","--endl--"))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code,key=len)[-1]
             code = code.replace("--endl--","\n")
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
             self.progressbar_show.emit(False)
             self.stop = True
             return
         desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>",p.replace("\n"," "))
         if desc:
             desc = desc[0]
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
             desc = "No description available"
         # clean HTML escape codes
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         try:
             code = code.decode("utf8")
             code = HTMLParser().unescape(code)
             code = code.encode("utf8")
             code = code.replace("\xc2\xa0", " ")
         except:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ")+mac+"\n")
         self.update_macro.emit(self.idx,self.macros[self.idx]+[desc,code,url])
     if self.macros[self.idx][1] == 1 :
         message = "<strong>" + translate("AddonsInstaller", "<strong>This addon is already installed.") + "</strong><br>" + desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     else:
         message = desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     self.info_label.emit( message )
     self.progressbar_show.emit(False)
     self.stop = True
Example #6
 def run(self):
     self.progressbar_show.emit(True)
     self.info_label.emit(translate("AddonsInstaller", "Retrieving description..."))
     if len(self.macros[self.idx]) > 2:
         desc = self.macros[self.idx][2]
         url = self.macros[self.idx][4]
     else:
         mac = self.macros[self.idx][0].replace(" ","_")
         mac = mac.replace("&","%26")
         mac = mac.replace("+","%2B")
         url = "https://www.freecadweb.org/wiki/Macro_"+mac
         self.info_label.emit("Retrieving info from " + str(url))
         if ctx:
             u = urllib2.urlopen(url,context=ctx)
         else:
             u = urllib2.urlopen(url)
         p = u.read()
         if sys.version_info.major >= 3 and isinstance(p, bytes):
             p = p.decode("utf-8")
         u.close()
         code = re.findall("<pre>(.*?)<\/pre>",p.replace("\n","--endl--"))
         if code:
             code = code[0]
             code = code.replace("--endl--","\n")
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
             self.progressbar_show.emit(False)
             self.stop = True
             return
         desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>",p.replace("\n"," "))
         if desc:
             desc = desc[0]
         else:
             self.info_label.emit(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
             desc = "No description available"
         # clean HTML escape codes
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         try:
             code = code.decode("utf8")
             code = HTMLParser().unescape(code)
             code = code.encode("utf8")
             code = code.replace("\xc2\xa0", " ")
         except:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ")+mac+"\n")
         self.update_macro.emit(self.idx,self.macros[self.idx]+[desc,code,url])
     if self.macros[self.idx][1] == 1 :
         message = "<strong>" + translate("AddonsInstaller", "<strong>This addon is already installed.") + "</strong><br>" + desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     else:
         message = desc + ' - <a href="' + url + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">' + url + '</span></a>'
     self.info_label.emit( message )
     self.progressbar_show.emit(False)
     self.stop = True
Example #7
def retroclassic():
    #call scraper from folder
    from .scrapers import retrovision
    #run scraper
    retrovision.all_movies()
    #return list of dicts from scraper
    for items in retrovision.ReturnList:
        #HTMLParser cleans up any text that is still HTML-encoded
        Description = HTMLParser().unescape(items.get('description', ''))
        Description = Description.encode('utf-8')
        BYB.addDir_file(ItemColor(items.get('title', '')),
                        items.get('playlink', ''), 902, items.get('icon', ''),
                        addon_fanart, Description, '', '', '')
    del retrovision.ReturnList[:]
Example #8
 def fill_details_from_wiki(self, url):
     try:
         if ctx:
             u = urllib2.urlopen(url, context=ctx)
         else:
             u = urllib2.urlopen(url)
     except urllib2.HTTPError:
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
     if code:
         # code = code[0]
         # take the biggest code block
         code = sorted(code, key=len)[-1]
         code = code.replace('--endl--', '\n')
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to fetch the code of this macro."))
     # Clean HTML escape codes.
     try:
         from HTMLParser import HTMLParser
     except ImportError:
         from html.parser import HTMLParser
     try:
         code = code.decode('utf8')
         code = HTMLParser().unescape(code)
         code = code.encode('utf8')
         code = code.replace('\xc2\xa0', ' ')
     except:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller", "Unable to clean macro code: ") +
             mac + '\n')
     desc = re.findall(
         "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
         p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #9
    def process(self, file_data):
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            file_mime_type = m.id_buffer(file_data['contents'])
        metadata = {}
        if file_mime_type == 'text/plain':
            text = file_data['contents']
        elif file_mime_type == 'text/html':
            text = parse_html(file_data['contents'], True, ['script', 'style'])
        elif file_mime_type == 'application/pdf':
            text, metadata = extract_pdf(file_data['contents'])
        else:
            # If we can't detect the mimetype, we add a flag that can be read by
            # the frontend to provide more information on why the document
            # wasn't processed.
            # XXX: We're returning empty text because, if we don't, the
            # pipeline will run indefinitely. The right approach is to make
            # pypelinin understand a specific exception (something like
            # StopPipeline) as a signal to stop processing this pipeline.
            return {
                'mimetype': 'unknown',
                'text': "",
                'file_metadata': {},
                'language': ""
            }

        text, forced_decoding = trial_decode(text)

        if isinstance(text, unicode):
            # HTMLParser only handles unicode objects. We can't pass the text
            # through it if we don't know the encoding, and it's possible we
            # also shouldn't. There's no way of knowing whether it's badly
            # encoded HTML or a binary blob that happens to have bytes that
            # look like HTML entities.
            text = HTMLParser().unescape(text)

        text = clean(text)

        if isinstance(text, unicode):
            language = cld.detect(text.encode('utf-8'))[1]
        else:
            language = cld.detect(text)[1]

        return {
            'text': text,
            'file_metadata': metadata,
            'language': language,
            'mimetype': file_mime_type,
            'forced_decoding': forced_decoding
        }
Example #10
 def fill_details_from_wiki(self, url):
     try:
         if ctx:
             u = urllib2.urlopen(url, context=ctx)
         else:
             u = urllib2.urlopen(url)
     except urllib2.HTTPError:
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
     if code:
         # code = code[0]
         # take the biggest code block
         code = sorted(code, key=len)[-1]
         code = code.replace('--endl--', '\n')
     else:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
     # Clean HTML escape codes.
     try:
         from HTMLParser import HTMLParser
     except ImportError:
         from html.parser import HTMLParser
     try:
         code = code.decode('utf8')
         code = HTMLParser().unescape(code)
         code = code.encode('utf8')
         code = code.replace('\xc2\xa0', ' ')
     except:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + mac + '\n')
     desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #11
def getKwl(url):
    '''
    Build a kwl playlist from the song list at the given URL.

    :param url: address of the song list to fetch
    :return: the kwl text as gb2312-encoded bytes, or False if the list could not be fetched
    '''

    info = getList(url)
    if info == False:
        return False

    kwl = ''
    for item in info:
        kwl += '    <so name="%s." artist="%s" album=""></so>\r\n' % (
            item[0].replace('"', ''), item[1])
    kwl = '<so>\r\n%s</so>' % kwl

    kwl = HTMLParser().unescape(kwl)
    kwl = kwl.encode('gb2312', errors='ignore')
    return kwl
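
getKwl() returns gb2312-encoded bytes (or False when the song list cannot be fetched), so a caller would typically write the result straight to a binary file. A short usage sketch with a hypothetical list URL:

kwl = getKwl('http://example.com/songlist?id=1')  # hypothetical URL
if kwl:
    with open('playlist.kwl', 'wb') as f:
        f.write(kwl)
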
Example #12
    def process(self, file_data):
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            file_mime_type = m.id_buffer(file_data['contents'])
        metadata = {}
        if file_mime_type == 'text/plain':
            text = file_data['contents']
        elif file_mime_type == 'text/html':
            text = parse_html(file_data['contents'], True, ['script', 'style'])
        elif file_mime_type == 'application/pdf':
            text, metadata = extract_pdf(file_data['contents'])
        else:
            # If we can't detect the mimetype, we add a flag that can be read by
            # the frontend to provide more information on why the document
            # wasn't processed.
            # XXX: We're returning empty text because, if we don't, the
            # pipeline will run indefinitely. The right approach is to make
            # pypelinin understand a specific exception (something like
            # StopPipeline) as a signal to stop processing this pipeline.
            return {'mimetype': 'unknown', 'text': "",
                    'file_metadata': {}, 'language': ""}

        text, forced_decoding = trial_decode(text)

        if isinstance(text, unicode):
            # HTMLParser only handles unicode objects. We can't pass the text
            # through it if we don't know the encoding, and it's possible we
            # also shouldn't. There's no way of knowing whether it's badly
            # encoded HTML or a binary blob that happens to have bytes that
            # look like HTML entities.
            text = HTMLParser().unescape(text)

        text = clean(text)

        if isinstance(text, unicode):
            language = cld.detect(text.encode('utf-8'))[1]
        else:
            language = cld.detect(text)[1]

        return {'text': text, 'file_metadata': metadata, 'language': language,
                'mimetype': file_mime_type, 'forced_decoding': forced_decoding}
Example #13
def extract_tweets(raw_html):
	''' This function scrapes the specified HTML string for Tweets and some related information.
	    Returns a list of lists(username, friendly_time, timestamp, tweet_text). '''
		
	if (len(raw_html)==0): raise TypeError("No raw_html specified");
		
	#Set up some temporary and holding variables for later
	retrieved_tweets = []; active_tweet= []; to_append="";

	#Query for username UNION time UNION timestamp UNION text
	xpath_query = "//span[starts-with(@class,'username')] | //small[@class = 'time']/a/@title | //span[starts-with(@class, '_timestamp')]/@data-time-ms | //p[contains(@class,'js-tweet-text')]"
	tree = html.fromstring(raw_html)
	query_results = tree.xpath(xpath_query)

	#Walk through query results
	for q in query_results:
		#We can extract all elements directly, EXCEPT for tweet text, because that's not an actual text element yet
		#See http://stackoverflow.com/questions/29398751 for why we query it like this (it's because of formatting)
		if (type(q) is lxml.html.HtmlElement):
			to_append = q.text_content()
		else: to_append = q;

		#Clean the extracted element up a little, make sure it's UTF-8 encoded and contains no linebreaks
		to_append = HTMLParser().unescape(to_append)
		to_append = to_append.encode('utf-8', errors='replace')
		to_append = to_append.replace('\n', ' ')
		to_append = to_append.replace(';', ',')

		#Append the cleaned-up string to the active element
		active_tweet.append(to_append)

		#Each tweet item contains (username, time, timestamp, text), so:
		#if we have reached a length of 4, the current item is finished and can be appended to the result set
		if (len(active_tweet) == 4):
			retrieved_tweets.append(active_tweet)
			active_tweet = []

	#Once we've walked through all query elements, the analysis is finished and we return the list-of-lists
	return retrieved_tweets
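
extract_tweets() only needs markup that matches its XPath union (a username span, the time anchor's title, a _timestamp span's data-time-ms attribute and a js-tweet-text paragraph), so it can be smoke-tested on a hand-built fragment. A usage sketch, assuming Python 2 as in the snippet itself:

sample = '''
<div class="tweet">
  <span class="username u-dir">@example_user</span>
  <small class="time"><a title="12:00 - 1 Jan 2019" href="#">1 Jan</a></small>
  <span class="_timestamp js-short-timestamp" data-time-ms="1546344000000"></span>
  <p class="TweetTextSize js-tweet-text tweet-text">Hello &amp; welcome!</p>
</div>
'''
for username, friendly_time, timestamp, text in extract_tweets(sample):
    print(username + ' | ' + friendly_time + ' | ' + timestamp + ' | ' + text)
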
Example #14
def clear_string(name):
    """
    Convert all the &# codes to char, remove extra-space and normalize
    :param name: string to convert
    :type name: object
    :return: converted string
    """
    from HTMLParser import HTMLParser
    if type(name) is not unicode:
        name = name.__str__()
    if type(name) is str:
        try:
            name = name.decode('utf-8')

        except:
            name = unicode(name, 'utf-8', errors='replace')

    name = name.replace('<![CDATA[', '').replace(']]', '')
    name = HTMLParser().unescape(name)
    if type(name) is not str:
        name = name.encode('utf-8')

    return name
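
clear_string() expects Python 2 (it imports the HTMLParser module and checks for the unicode type) and hands back a UTF-8 encoded str. A quick usage sketch:

title = clear_string('Tom &amp; Jerry &ndash; Folge 1')
print(title)  # Tom & Jerry – Folge 1
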
def multiple_videos_playlists(params):
    _id = params.id
    title = HTMLParser().unescape(params.name)
    if type(title) == unicode:
        title = title.encode('utf-8')
    url = build_url('posts', {'include': _id})
    i = requests.get(url).json()[0]
    content = i.get('content')['rendered']
    yt_playlists, yt_videos = parse_ids_content(content)
    listing = []
    for index, playlist in enumerate(yt_playlists):
        yt_pid, yt_vid = playlist
        image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
        listing.append({
            'label': "{0} {1}".format(_("Playlist"), index + 1),
            'thumb': image,
            'is_playable': False,
            'url': "plugin://plugin.video.youtube/playlist/{0}/".format(yt_pid),
        })
    for index, yt_vid in enumerate(yt_videos):
        image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
        listing.append({
            'label': "{0} {1}".format(_("Video"), index + 1),
            'thumb': image,
            'is_playable': True,
            'url': plugin.get_url(action='play', youtube_id=yt_vid, name=title),
        })
    return listing
def list_videos(
    url,
    enable_bookmark=True,
):
    json_data = requests.get(url).json()
    listing = []
    for i in json_data:
        _id = i.get('id')
        title = HTMLParser().unescape(i.get('title')['rendered'])
        if type(title) == unicode:
            title = title.encode('utf-8')
        content = i.get('content')['rendered']
        date = i.get('date')[:10]
        slug = i.get('slug')
        soup = bs4.BeautifulSoup(content, 'html5lib')
        try:
            plot = soup.find('meta', {'itemprop': 'description'})['content']
        except TypeError:
            plot = ""
        yt_playlists, yt_videos = parse_ids_content(content)
        context_menu = []
        if enable_bookmark:
            context_menu.append(
                (_("Add to Bookmarks"), 'XBMC.RunPlugin({0})'.format(
                    plugin.get_url(action='add_bookmark', id=_id))), )
        context_menu += [
            (_("Show tags"), 'XBMC.Container.Update({0})'.format(
                plugin.get_url(action='tags_by_post', id=_id))),
            (_("Show categories"), 'XBMC.Container.Update({0})'.format(
                plugin.get_url(action='categories_by_post', id=_id))),
            #(_("Force mirror search"),
            #'XBMC.RunPlugin({0})'.format(plugin.get_url(action='force_mirror', name=title))),
        ]
        if len(yt_videos) == 0 and len(yt_playlists) == 0:  # search for mirror
            image = "http://dokustreams.de/wp-content/uploads/{0}.jpg".format(
                slug)
            listing.append({
                'label': title,
                'thumb': image,
                'info': {
                    'video': {
                        'title': title,
                        'plot': plot,
                        'aired': date,
                        'year': date[:4],
                    }
                },
                'context_menu': context_menu,
                'is_playable': True,
                'url': plugin.get_url(action='play', name=title),
            })
        elif len(yt_videos) == 1 and len(yt_playlists) == 0:  # start the video directly
            yt_vid = yt_videos[0]
            image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
            listing.append({
                'label': title,
                'thumb': image,
                'info': {
                    'video': {
                        'title': title,
                        'plot': plot,
                        'aired': date,
                        'year': date[:4],
                    }
                },
                'context_menu': context_menu,
                'is_playable': True,
                'url': plugin.get_url(action='play', youtube_id=yt_vid, name=title),
            })
        elif len(yt_videos) == 0 and len(yt_playlists) == 1:  # show the playlist directly
            yt_pid, yt_vid = yt_playlists[0]
            image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
            listing.append({
                'label': title,
                'thumb': image,
                'info': {
                    'video': {
                        'title': title,
                        'plot': plot,
                        'aired': date,
                        'year': date[:4],
                    }
                },
                'context_menu': context_menu,
                'is_playable': False,
                'url': "plugin://plugin.video.youtube/playlist/{0}/".format(yt_pid),
            })
        else:  # show playlists and videos together
            image = "http://dokustreams.de/wp-content/uploads/{0}.jpg".format(slug)
            listing.append({
                'label': title,
                'thumb': image,
                'context_menu': context_menu,
                'is_playable': False,
                'url': plugin.get_url(action='multiple_videos_playlists', id=_id, name=title),
            })
    if len(json_data) == PER_PAGE:
        next_page = page_from_url(url) + 1
        next_url = edit_url(url, {'page': next_page})
        listing.append({
            'label': '[COLOR blue]{0}[/COLOR]'.format(_("Next page")),
            'url': plugin.get_url(action='posts_by_url', url=next_url),
        })
    return listing
Example #17
    '%d0%b3%d1%80%d1%8b%d0%bd%d1%8c',                                     # Gryn`          178 OKc
    '%D0%BB%D0%B5%D1%89%D1%83%D0%BA',                                     # Leschuk       3929 OKc
    '%D0%9A%D0%B0%D0%BC%D0%BB%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE',       # Kamlichenko      5 OKc
    '%D0%9A%D0%BE%D0%BC%D0%BB%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE',       # Komlichenko    440 OKc
    '%D0%9A%D0%B0%D0%BB%D0%B5%D0%BD%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE', # Kalenichenko  2019 OKc
    '%D0%9A%D0%B0%D0%BB%D0%B8%D0%BD%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE'] # Kalinichenko 20k
    # Nadijka, Nadejka: 0; Abazovka: no search by settlement available
    # Moroz: Vadim 200 people, Andr 994, Vlad 2400; Pavel 400, Mixail 1200, Maks 200, Roman 407, Anat 1208, Evg 401, Taisia 71, Lubov 1001, Tatyana 2000, Dmit 700

  k0 = 1
  link0 = 'http://nomerorg.com/allukraina/lastName_' + name_list[k0] + '_pagenumber_'
  nm = 1 + int(65645/15)
  k_proxy = 0
  u.install_opener(u.build_opener(u.ProxyHandler({'http': proxy_list[k_proxy]})))
  for k in range(877, nm):
    link1 = link0 + str(k) + '.html'
    flag1 = False
    fd = open(dir0 + 'name' + str(k0) + '_' + str(10000 + k) + '_.csv', 'w')
    print k, k_proxy
    while not flag1:
      try:
        url1 = u.urlopen(u.Request(link1, headers=url_hdr), timeout=3)
        page1 = html.fragments_fromstring(url1.read())
        flag1 = (1 < len(page1))
      except:
        k_proxy += 1
        u.install_opener(u.build_opener(u.ProxyHandler({'http': proxy_list[k_proxy]})))
        print k, k_proxy  # print HTMLParser().unescape(u.unquote(page1[0]))
    o = html.tostring(page1[3])
    o = o[:o.find('</table>') - 10].replace('</td><td>', ',').replace('</td></tr><tr><td>', '\n')
    o = o[o.rfind('/th></tr><tr><td>') + 17:] + '\n'
    o = HTMLParser().unescape(u.unquote(o))
    if -1 < o.find('adsbygoogle'):
      break
    fd.write(o.encode('utf-16'))
    time.sleep(.1)
    fd.close()
  # Forbidden (blocked by queries) / connection refused (try again)
  #--- consolidate files
  k = 1
  o1 = glob.glob(dir0 + 'name1/name' + str(k) + '_1*.csv')
  fd = open(dir0 + 'name' + str(k) + '.csv', 'wb')
  for q in o1:
    fd_in = open(q, 'rb')
    o = fd_in.read()
    fd_in.close()
    fd.write(o)
  fd.close()


      
      
Example #18
def find_dir(cco):
    f = { 'q' : cco.encode('utf-8') }
    u = dir_server + urllib.urlencode(f)
    r = None
    try:
        s = requests.Session()
        r=s.get(u)
        print(str(r.text.encode('utf-8')))
        headers={'Content-type': 'application/x-www-form-urlencoded'}
        data="userid="+dir_user+"&password="******"&target=&smauthreason=&smquerydata=&smagentname=&postpreservationdata=&SMENC=&SMLOCALE="
        r=s.post(sso_url,data,headers)
    except requests.exceptions.ConnectionError:
        return "Connection error to directory server"
    try: 
        from BeautifulSoup import BeautifulSoup
        from HTMLParser import HTMLParser
    except ImportError:
        from bs4 import BeautifulSoup
        from html.parser import HTMLParser
    html = HTMLParser().unescape(r.text)
    sys.stderr.write("html: "+str(html.encode('utf-8'))+"\n")
    parsed_html = BeautifulSoup(html)
    table=parsed_html.body.find('table', attrs={'id':'resultsTable'})
    if table is not None:
        result_list=[unicodedata.normalize('NFKD',i.text) for i in table.findAll('a',attrs={'class':'hover-link'})]
        found=False
        for n in result_list:
            m = re.search(r"\(([A-Za-z0-9]+)\)", n)
            if m.group(1) == cco:
                u=dir_detail_server+cco
                r=s.get(u)
                print(r.text)
                html = HTMLParser().unescape(r.text)
                sys.stderr.write("html: "+str(html.encode('utf-8'))+"\n")
                parsed_html = BeautifulSoup(html)
                found=True
                print("Found!")
        if not found:
            txt="Are you looking for one of these people:"
            for i in result_list:
                txt+="\n * "+str(i)
            return txt
    name=parsed_html.body.find('h2', attrs={'class':'userName'})
    sys.stderr.write("name: "+str(name)+"\n")
    if not hasattr(name, 'text'):
        return "CCO id not found !"
    else:
        tmp=parsed_html.body.find('p', attrs={'class':'userId'})
        print("tmp: "+str(tmp))
        m=re.search(r"\(([A-Za-z0-9]+)\)", str(tmp))
        print("m: "+str(m))
        real_cco=str(m.group(1))
        sys.stderr.write("real_cco: "+str(real_cco)+"\n")
        title=parsed_html.body.find('p', attrs={'class':'des'})
        sys.stderr.write("title: "+str(title)+"\n")
        manager=parsed_html.body.find('a', attrs={'class':'hover-link'})
        sys.stderr.write("manager: "+str(manager)+"\n")
        phone_text=""
        phone=parsed_html.body.find('div', attrs={'id':'dir_phone_links'})
        if phone is not None:
            for p in phone.findAll('p'):
                if p.text.find("Work") > -1 or p.text.find("Mobile") > -1 :
                    phone_text+=str(p.text)+"<br>"
        u = str(parsed_html.body.find('div',attrs={'class':'profImg'}).find('img')['src'])
        response = requests.get(u, stream=True)
        encoded_string = base64.b64encode(response.raw.read())
        return name.text+"<br>;"+title.text.replace('.',' ')+"<br>;"+manager.text+"<br>;"+phone_text+";"+encoded_string+";"+"<a href=\"http://wwwin-tools.cisco.com/dir/details/"+real_cco+"\">directory link</a>"
Example #19
                self.printlog("Connection successful.")
            except Exception, e:  # if fails, retry
                self.printlog(
                    "Connection failed: %s (retrying in 30 seconds)" % e)
                time.sleep(30)
                self.printlog("Retrying... (%s retries left)" % retry['count'])

        if not page:
            funcs.printlog(
                "Connection failed after %s retries. Please check your internet connection. Exiting..."
                % retry['max'])
            sys.exit(1)

        page = page.decode('utf-8')
        page = HTMLParser().unescape(page)
        page = page.encode('utf-8')

        return page

    class MLStripper(HTMLParser):
        def __init__(self):
            self.reset()
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    def strip_tags(self, html):
class FieldStorage:

    """Store a sequence of fields, reading multipart/form-data.

    This class provides naming, typing, files stored on disk, and
    more.  At the top level, it is accessible like a dictionary, whose
    keys are the field names.  (Note: None can occur as a field name.)
    The items are either a Python list (if there's multiple values) or
    another FieldStorage or MiniFieldStorage object.  If it's a single
    object, it has the following attributes:

    name: the field name, if specified; otherwise None

    filename: the filename, if specified; otherwise None; this is the
        client side filename, *not* the file name on which it is
        stored (that's a temporary file you don't deal with)

    value: the value as a *string*; for file uploads, this
        transparently reads the file every time you request the value

    file: the file(-like) object from which you can read the data;
        None if the data is stored a simple string

    type: the content-type, or None if not specified

    type_options: dictionary of options specified on the content-type
        line

    disposition: content-disposition, or None if not specified

    disposition_options: dictionary of corresponding options

    headers: a dictionary(-like) object (sometimes rfc822.Message or a
        subclass thereof) containing *all* headers

    The class is subclassable, mostly for the purpose of overriding
    the make_file() method, which is called internally to come up with
    a file open for reading and writing.  This makes it possible to
    override the default choice of storing all files in a temporary
    directory and unlinking them as soon as they have been opened.

    """

    def __init__(self, fp=None, headers=None, outerboundary="",
                 environ=os.environ, keep_blank_values=0, strict_parsing=0):
        """Constructor.  Read multipart/* until last part.

        Arguments, all optional:

        fp              : file pointer; default: sys.stdin
            (not used when the request method is GET)

        headers         : header dictionary-like object; default:
            taken from environ as per CGI spec

        outerboundary   : terminating multipart boundary
            (for internal use only)

        environ         : environment dictionary; default: os.environ

        keep_blank_values: flag indicating whether blank values in
            percent-encoded forms should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        """
        method = 'GET'
        self.keep_blank_values = keep_blank_values
        self.strict_parsing = strict_parsing
        if 'REQUEST_METHOD' in environ:
            method = environ['REQUEST_METHOD'].upper()
        self.qs_on_post = None
        if method == 'GET' or method == 'HEAD':
            if 'QUERY_STRING' in environ:
                qs = environ['QUERY_STRING']
            elif sys.argv[1:]:
                qs = sys.argv[1]
            else:
                qs = ""
            fp = StringIO(qs)
            if headers is None:
                headers = {'content-type':
                           "application/x-www-form-urlencoded"}
        if headers is None:
            headers = {}
            if method == 'POST':
                # Set default content-type for POST to what's traditional
                headers['content-type'] = "application/x-www-form-urlencoded"
            if 'CONTENT_TYPE' in environ:
                headers['content-type'] = environ['CONTENT_TYPE']
            if 'QUERY_STRING' in environ:
                self.qs_on_post = environ['QUERY_STRING']
            if 'CONTENT_LENGTH' in environ:
                headers['content-length'] = environ['CONTENT_LENGTH']
        self.fp = fp or sys.stdin
        self.headers = headers
        self.outerboundary = outerboundary

        # Process content-disposition header
        cdisp, pdict = "", {}
        if 'content-disposition' in self.headers and rfc6266:
            cd = rfc6266.parse_headers(self.headers['content-disposition'], relaxed=True)
            cdisp, pdict = cd.disposition, cd.assocs
        elif 'content-disposition' in self.headers:
            cdisp, pdict = parse_header(self.headers['content-disposition'])
        self.disposition = cdisp
        self.disposition_options = pdict
        self.name = None
        if 'name' in pdict:
            self.name = pdict['name']
        self.filename = None
        if 'filename' in pdict:
            self.filename = pdict['filename']
        if 'filename*' in pdict:
            self.filename = pdict['filename*'].string
        if self.filename and '&' in self.filename:
            from HTMLParser import HTMLParser
            self.filename = HTMLParser().unescape(self.filename)
        if isinstance(self.filename, unicode):
            self.filename = self.filename.encode('utf8')

        # Process content-type header
        #
        # Honor any existing content-type header.  But if there is no
        # content-type header, use some sensible defaults.  Assume
        # outerboundary is "" at the outer level, but something non-false
        # inside a multi-part.  The default for an inner part is text/plain,
        # but for an outer part it should be urlencoded.  This should catch
        # bogus clients which erroneously forget to include a content-type
        # header.
        #
        # See below for what we do if there does exist a content-type header,
        # but it happens to be something we don't understand.
        if 'content-type' in self.headers:
            ctype, pdict = parse_header(self.headers['content-type'])
        elif self.outerboundary or method != 'POST':
            ctype, pdict = "text/plain", {}
        else:
            ctype, pdict = 'application/x-www-form-urlencoded', {}
        self.type = ctype
        self.type_options = pdict
        self.innerboundary = ""
        if 'boundary' in pdict:
            self.innerboundary = pdict['boundary']
        clen = -1
        if 'content-length' in self.headers:
            try:
                clen = int(self.headers['content-length'])
            except ValueError:
                pass
            if maxlen and clen > maxlen:
                raise ValueError, 'Maximum content length exceeded'
        self.length = clen

        self.list = self.file = None
        self.done = 0
        if ctype == 'application/x-www-form-urlencoded':
            self.read_urlencoded()
        elif ctype[:10] == 'multipart/':
            self.read_multi(environ, keep_blank_values, strict_parsing)
        else:
            self.read_single()

    def __repr__(self):
        """Return a printable representation."""
        return "FieldStorage(%r, %r, %r)" % (
                self.name, self.filename, self.value)

    def __iter__(self):
        return iter(self.keys())

    def __getattr__(self, name):
        if name != 'value':
            raise AttributeError, name
        if self.file:
            self.file.seek(0)
            value = self.file.read()
            self.file.seek(0)
        elif self.list is not None:
            value = self.list
        else:
            value = None
        return value

    def __getitem__(self, key):
        """Dictionary style indexing."""
        if self.list is None:
            raise TypeError, "not indexable"
        found = []
        for item in self.list:
            if item.name == key: found.append(item)
        if not found:
            raise KeyError, key
        if len(found) == 1:
            return found[0]
        else:
            return found

    def getvalue(self, key, default=None):
        """Dictionary style get() method, including 'value' lookup."""
        if key in self:
            value = self[key]
            if type(value) is type([]):
                return map(attrgetter('value'), value)
            else:
                return value.value
        else:
            return default

    def getfirst(self, key, default=None):
        """ Return the first value received."""
        if key in self:
            value = self[key]
            if type(value) is type([]):
                return value[0].value
            else:
                return value.value
        else:
            return default

    def getlist(self, key):
        """ Return list of received values."""
        if key in self:
            value = self[key]
            if type(value) is type([]):
                return map(attrgetter('value'), value)
            else:
                return [value.value]
        else:
            return []

    def keys(self):
        """Dictionary style keys() method."""
        if self.list is None:
            raise TypeError, "not indexable"
        return list(set(item.name for item in self.list))

    def has_key(self, key):
        """Dictionary style has_key() method."""
        if self.list is None:
            raise TypeError, "not indexable"
        return any(item.name == key for item in self.list)

    def __contains__(self, key):
        """Dictionary style __contains__ method."""
        if self.list is None:
            raise TypeError, "not indexable"
        return any(item.name == key for item in self.list)

    def __len__(self):
        """Dictionary style len(x) support."""
        return len(self.keys())

    def __nonzero__(self):
        return bool(self.list)

    def read_urlencoded(self):
        """Internal: read data in query string format."""
        qs = self.fp.read(self.length)
        if self.qs_on_post:
            qs += '&' + self.qs_on_post
        self.list = list = []
        for key, value in urlparse.parse_qsl(qs, self.keep_blank_values,
                                            self.strict_parsing):
            list.append(MiniFieldStorage(key, value))
        self.skip_lines()

    FieldStorageClass = None

    def read_multi(self, environ, keep_blank_values, strict_parsing):
        """Internal: read a part that is itself multipart."""
        ib = self.innerboundary
        if not valid_boundary(ib):
            raise ValueError, 'Invalid boundary in multipart form: %r' % (ib,)
        self.list = []
        if self.qs_on_post:
            for key, value in urlparse.parse_qsl(self.qs_on_post,
                                self.keep_blank_values, self.strict_parsing):
                self.list.append(MiniFieldStorage(key, value))
            FieldStorageClass = None

        klass = self.FieldStorageClass or self.__class__
        part = klass(self.fp, {}, ib,
                     environ, keep_blank_values, strict_parsing)
        # Throw first part away
        while not part.done:
            headers = rfc822.Message(self.fp)
            part = klass(self.fp, headers, ib,
                         environ, keep_blank_values, strict_parsing)
            self.list.append(part)
        self.skip_lines()

    def read_single(self):
        """Internal: read an atomic part."""
        if self.length >= 0:
            self.read_binary()
            self.skip_lines()
        else:
            self.read_lines()
        self.file.seek(0)

    bufsize = 8*1024            # I/O buffering size for copy to file

    def read_binary(self):
        """Internal: read binary data."""
        self.file = self.make_file('b')
        todo = self.length
        if todo >= 0:
            while todo > 0:
                data = self.fp.read(min(todo, self.bufsize))
                if not data:
                    self.done = -1
                    break
                self.file.write(data)
                todo = todo - len(data)

    def read_lines(self):
        """Internal: read lines until EOF or outerboundary."""
        self.file = self.__file = StringIO()
        if self.outerboundary:
            self.read_lines_to_outerboundary()
        else:
            self.read_lines_to_eof()

    def __write(self, line):
        if self.__file is not None:
            if self.__file.tell() + len(line) > 1000:
                self.file = self.make_file('')
                self.file.write(self.__file.getvalue())
                self.__file = None
        self.file.write(line)

    def read_lines_to_eof(self):
        """Internal: read lines until EOF."""
        while 1:
            line = self.fp.readline(1<<16)
            if not line:
                self.done = -1
                break
            self.__write(line)

    def read_lines_to_outerboundary(self):
        """Internal: read lines until outerboundary."""
        next = "--" + self.outerboundary
        last = next + "--"
        delim = ""
        last_line_lfend = True
        while 1:
            line = self.fp.readline(1<<16)
            if not line:
                self.done = -1
                break
            if delim == "\r":
                line = delim + line
                delim = ""
            if line[:2] == "--" and last_line_lfend:
                strippedline = line.strip()
                if strippedline == next:
                    break
                if strippedline == last:
                    self.done = 1
                    break
            odelim = delim
            if line[-2:] == "\r\n":
                delim = "\r\n"
                line = line[:-2]
                last_line_lfend = True
            elif line[-1] == "\n":
                delim = "\n"
                line = line[:-1]
                last_line_lfend = True
            elif line[-1] == "\r":
                # We may interrupt \r\n sequences if they span the 2**16
                # byte boundary
                delim = "\r"
                line = line[:-1]
                last_line_lfend = False
            else:
                delim = ""
                last_line_lfend = False
            self.__write(odelim + line)

    def skip_lines(self):
        """Internal: skip lines until outer boundary if defined."""
        if not self.outerboundary or self.done:
            return
        next = "--" + self.outerboundary
        last = next + "--"
        last_line_lfend = True
        while 1:
            line = self.fp.readline(1<<16)
            if not line:
                self.done = -1
                break
            if line[:2] == "--" and last_line_lfend:
                strippedline = line.strip()
                if strippedline == next:
                    break
                if strippedline == last:
                    self.done = 1
                    break
            last_line_lfend = line.endswith('\n')

    def make_file(self, binary=None):
        """Overridable: return a readable & writable file.

        The file will be used as follows:
        - data is written to it
        - seek(0)
        - data is read from it

        The 'binary' argument is unused -- the file is always opened
        in binary mode.

        This version opens a temporary file for reading and writing,
        and immediately deletes (unlinks) it.  The trick (on Unix!) is
        that the file can still be used, but it can't be opened by
        another process, and it will automatically be deleted when it
        is closed or when the current process terminates.

        If you want a more permanent file, you derive a class which
        overrides this method.  If you want a visible temporary file
        that is nevertheless automatically deleted when the script
        terminates, try defining a __del__ method in a derived class
        which unlinks the temporary files you have created.

        """
        import tempfile
        return tempfile.TemporaryFile("w+b")
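
The FieldStorage variant above keeps the classic cgi-module interface (dictionary-style access plus getvalue/getfirst/getlist), so a CGI script would drive it in the usual way. A minimal usage sketch:

form = FieldStorage()                      # reads QUERY_STRING or stdin depending on REQUEST_METHOD
name = form.getfirst('name', 'anonymous')  # first value for the field, with a default
tags = form.getlist('tag')                 # every value of a repeated field
print("Content-Type: text/plain")
print("")
print("hello " + name + "; tags: " + ", ".join(tags))
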
 def fill_details_from_wiki(self, url):
     code = ""
     try:
         u = urlopen(url)
     except:
         print("AddonManager: Debug: unable to open URL", url)
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     # check if the macro page has its code hosted elsewhere, download if needed
     if "rawcodeurl" in p:
         rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">", p)
         if rawcodeurl:
             rawcodeurl = rawcodeurl[0]
             try:
                 u2 = urlopen(rawcodeurl)
             except:
                 print("AddonManager: Debug: unable to open URL",
                       rawcodeurl)
                 return
             # code = u2.read()
             # github is slow to respond... We need to use this trick below
             response = ""
             block = 8192
             #expected = int(u2.headers['content-length'])
             while 1:
                 #print("expected:",expected,"got:",len(response))
                 data = u2.read(block)
                 if not data:
                     break
                 if sys.version_info.major >= 3 and isinstance(data, bytes):
                     data = data.decode('utf-8')
                 response += data
             if response:
                 code = response
             u2.close()
     if not code:
         code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code, key=len)[-1]
             code = code.replace('--endl--', '\n')
         else:
             FreeCAD.Console.PrintWarning(
                 translate("AddonsInstaller",
                           "Unable to fetch the code of this macro."))
         # Clean HTML escape codes.
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         if sys.version_info.major < 3:
             code = code.decode('utf8')
         try:
             code = HTMLParser().unescape(code)
             code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ')
         except:
             FreeCAD.Console.PrintWarning(
                 translate("AddonsInstaller", "Unable to clean macro code")
                 + ": " + code + '\n')
         if sys.version_info.major < 3:
             code = code.encode('utf8')
     desc = re.findall(
         "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
         p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(
             translate("AddonsInstaller",
                       "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Example #22
 def fill_details_from_wiki(self, url):
     code = ""
     try:
         u = urlopen(url)
     except urllib2.HTTPError:
         return
     p = u.read()
     if sys.version_info.major >= 3 and isinstance(p, bytes):
         p = p.decode('utf-8')
     u.close()
     # check if the macro page has its code hosted elsewhere, download if needed
     if "rawcodeurl" in p:
         rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">",p)
         if rawcodeurl:
             rawcodeurl = rawcodeurl[0]
             try:
                 u2 = urlopen(rawcodeurl)
             except urllib2.HTTPError:
                 return
             # code = u2.read()
             # github is slow to respond... We need to use this trick below
             response = ""
             block = 8192
             #expected = int(u2.headers['content-length'])
             while 1:
                 #print("expected:",expected,"got:",len(response))
                 data = u2.read(block)
                 if not data:
                     break
                 if sys.version_info.major >= 3 and isinstance(data, bytes):
                     data = data.decode('utf-8')
                 response += data
             if response:
                 code = response
             u2.close()
     if not code:
         code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
         if code:
             # code = code[0]
             # take the biggest code block
             code = sorted(code, key=len)[-1]
             code = code.replace('--endl--', '\n')
         else:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
         # Clean HTML escape codes.
         try:
             from HTMLParser import HTMLParser
         except ImportError:
             from html.parser import HTMLParser
         if sys.version_info.major < 3:
             code = code.decode('utf8')
         try:
             code = HTMLParser().unescape(code)
             code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ')
         except:
             FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + code + '\n')
         if sys.version_info.major < 3:
             code = code.encode('utf8')
     desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace('\n', ' '))
     if desc:
         desc = desc[0]
     else:
         FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
         desc = "No description available"
     self.desc = desc
     self.url = url
     self.code = code
     self.parsed = True
Beispiel #23
0
 #set file-level metadata
 for page in item.pages:
     fileID = itemID + '_' + page.id
     pagelabel = page.label
     pageRefURL = page.refurl
     #set transcription
     if (('full' in page.info) and page.info['full']):
         transcription = str(page.info['full'].encode('ascii', 'ignore'))
         transcription = HTMLParser().unescape(transcription)
     elif (('fula' in page.info) and page.info['fula']):
         transcription = str(page.info['fula'].encode('ascii', 'ignore'))
         transcription = HTMLParser().unescape(transcription)
     else:
         transcription = ''
     #skip if 'n/a' in transcription field
     if ((alias == 'cwd') and re.match('n/a', transcription.encode('ascii', 'ignore'))):
         pass
     else:
         #set transcription status
         if (transcription == ''):
             status = 'Not Started'
         elif ((alias == 'cwd') and re.match('reviewed', str(page.info['transc']))):
             status = 'Completed'
         else:
             status = 'Needs Review'
         # if alias == 'cookbooks':
             #url = tempdir + '/cookbooks_' + page.file.replace('jp2', 'jpg')
             # code below is for downloading and uploading to server
         if page.file[-3:] == 'jp2':
         # #download image to temp directory (downloads locally, move these to dropbox on server)
             #imagepath = page.imageurl
Beispiel #24
0
def list_videos(
    url,
    enable_bookmark=True,
):
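    # fetch the post list as JSON and build Kodi directory items from it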
    json_data = requests.get(url).json()
    listing = []
    for i in json_data:
        _id = i.get('id')
        title = HTMLParser().unescape(i.get('title')['rendered'])
        if type(title) == unicode:
            title = title.encode('utf-8')
        content = i.get('content')['rendered']
        date = i.get('date')[:10]
        yt_vurl = parse_ids_content(content)

        print "yt_vurl: %s" % yt_vurl
        if not yt_vurl:
            print content

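        # extract the 11-character YouTube video id from the embedded URL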
        yt_regex = '(?:youtube(?:-nocookie)?\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})\W'
        yt_vid = re.findall(yt_regex, yt_vurl)[0]
        context_menu = []
        if enable_bookmark:
            context_menu.append(
                (_("Add to Bookmarks"), 'XBMC.RunPlugin({0})'.format(
                    plugin.get_url(action='add_bookmark', id=_id))), )
        context_menu += [
            (_("Show tags"), 'XBMC.Container.Update({0})'.format(
                plugin.get_url(action='tags_by_post', id=_id))),
            (_("Show categories"), 'XBMC.Container.Update({0})'.format(
                plugin.get_url(action='categories_by_post', id=_id))),
            #(_("Force mirror search"),
            #'XBMC.RunPlugin({0})'.format(plugin.get_url(action='force_mirror', name=title))),
        ]
        image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
        listing.append({
            'label':
            title,
            'thumb':
            image,
            'info': {
                'video': {
                    'title': title,
                    'aired': date,
                    'year': date[:4],
                }
            },
            'context_menu':
            context_menu,
            'is_playable':
            True,
            'url':
            plugin.get_url(action='play', youtube_id=yt_vid, name=title),
        })

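    # a full page of results implies more posts are available; append a "next page" entry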
    if len(json_data) == PER_PAGE:
        next_page = page_from_url(url) + 1
        next_url = edit_url(url, {'page': next_page})
        listing.append({
            'label':
            '[COLOR blue]{0}[/COLOR]'.format(_("Next page")),
            'url':
            plugin.get_url(action='posts_by_url', url=next_url),
        })
    return listing
Beispiel #25
0
 def node2str(self, xml_node):
     """XML Node to string"""
     xml_str = HTMLParser().unescape(str(xml_node))
     return xml_str.encode('utf-8') if isinstance(xml_str,
                                                  unicode) else xml_str
Beispiel #26
0
def addEntryToFeedDb(feedXML, feedDb, archiveEntry, cacheImages, rssToolDir, table, isFb):
	dbc = feedDb.cursor()
	post = ""
	if archiveEntry.has_key("content"):
		post=archiveEntry["content"]["content"]
	elif archiveEntry.has_key("summary"):
		post=archiveEntry["summary"]["content"]
	title = ""
	if archiveEntry.has_key("title"):
		title = HTMLParser().unescape(archiveEntry["title"])
	post = urllib.unquote(HTMLParser().unescape(post))
	url = ""
	if archiveEntry.has_key("alternate"):
		url = archiveEntry["alternate"][0]["href"]
	if title=="":
		if url=="":
			print "Warning: No title or URL!"
	#get ID for post by hashing title with date added to front
	hashstring = str(title.encode('ascii', 'ignore'))+str(url.replace('https://','http://').encode('ascii', 'ignore'))
	id = hashlib.sha224(hashstring).hexdigest()
	#check if post already exists in db and insert if it does not
	selectedRow = dbc.execute('SELECT * FROM '+table+' WHERE id=?', (id,)).fetchone()
	if selectedRow == None:
		if cacheImages==True:
			#setup images dir
			xmlfilename = feedXML.replace('http://','').replace('https://','').replace('/','_')
			if xmlfilename[-1]=='_':
				xmlfilename = xmlfilename[:-1]
			imagedir = rssToolDir+"feeds/"+xmlfilename;
			if os.path.exists(imagedir+'/images') == False:
				os.makedirs(imagedir+'/images')
			#get image URLs
			h=imgParse()
			h.clear()
			h.feed(post)
			imageLinks = h.imgLinks
			#download images, rename, and replace image URLs in posts
			j = 0
			for image in imageLinks:
				imagequoted = image
				image = urllib.unquote(image)
				targetfile = image.rpartition('/')[2]
				targetfile = str(j)
				#if image is from Facebook, handle specially
				sourceimage = image
				if isFb==True:
					fbimage = image
					fbimage = fbimage.replace('s130x130/','').replace('p130x130/','')
					fbimage = fbimage.replace('https://','http://')
					if "url=" in fbimage:
						fbimage = fbimage.split("url=")[1]
					sourceimage = fbimage
				imageType = downloadImage(sourceimage, imagedir+"/images/"+str(id)+'_'+targetfile)
				#add image info to db
				if imageType!="NotAnImage":
					if imageType!=None:
						imageQuery = (image, str(id)+'_'+targetfile+"."+imageType)
						selectedImage = dbc.execute('SELECT * FROM images WHERE original=?', (image,)).fetchone()
						if selectedImage == None:
							dbc.execute('INSERT INTO images VALUES (?,?)', imageQuery)
						else:
							print "Warning: Image "+image+" has already been cached."
					j = j+1
		#package post data into a row for db
		publishedTime = datetime.fromtimestamp(archiveEntry["published"])
		updatedTime = datetime.fromtimestamp(archiveEntry["updated"])
		postQuery = (id, title, url, publishedTime, updatedTime, post)
		dbc.execute('INSERT INTO '+table+' VALUES (?,?,?,?,?,?)', postQuery)
		feedDb.commit()
		print "Added post with ID "+str(id)+" to db table "+table
		return 0
	else:
		i = 0
		#print "Warning: Post with ID "+str(id)+" already exists in db."
		return 1
Beispiel #27
0
			try: # connect to page
				page = urllib2.urlopen(url) # open url
				page = page.read()
				self.printlog("Connection successful.")
			except Exception, e: # if fails, retry
				self.printlog("Connection failed: %s (retrying in 30 seconds)" % e)
				time.sleep(30)
				self.printlog("Retrying... (%s retries left)" % retry['count'])
	
		if not page:
			funcs.printlog("Connection failed after %s retries. Please check your internet connection. Exiting..." % retry['max'])
			sys.exit(1)

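		# decode the response, unescape HTML entities, then re-encode as UTF-8 bytes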
		page = page.decode('utf-8')
		page = HTMLParser().unescape(page)
		page = page.encode('utf-8')
	
		return page

	class MLStripper(HTMLParser):
		def __init__(self):
			self.reset()
			self.fed = []
		def handle_data(self, d):
			self.fed.append(d)
		def get_data(self):
			return ''.join(self.fed)

	def strip_tags(self, html):
		s = self.MLStripper()
		s.feed(html)