def listVideos(url):
    content = getUrl(url)
    # Top story block
    contenttop = content[content.find('<div class="topStoryWrapper clear">'):]
    contenttop = contenttop[:contenttop.find('<div class="subcategoryList clear">')]
    titletop = contenttop[contenttop.find('<h2 class="topStoryTitle">'):]
    match = re.compile('<a href="(.+?)" title="(.+?)"', re.DOTALL).findall(titletop)
    url = "http://" + language2 + ".euronews.com" + match[0][0]
    title = match[0][1]
    title = HTMLParser().unescape(title.decode('utf-8'))
    title = title.encode('utf-8')
    match = re.compile('src="(.+?)"', re.DOTALL).findall(titletop)
    thumb = match[0]
    match = re.compile('<p>(.+?)</p>', re.DOTALL).findall(titletop)
    desc = match[0]
    match = re.compile('([0-9]+/[0-9]+ [0-9]+:[0-9]+) CET', re.DOTALL).findall(contenttop)
    datum = match[0]
    debug("TITLE: " + title)
    debug("URL: " + url)
    addLink(datum + " - " + title, url, 'playVideo', thumb, desc)
    # Remaining list entries
    spl = content.split('<li class="clearAfter fixedHeight">')
    for i in range(1, len(spl), 1):
        element = spl[i]
        match = re.compile('([0-9]+/[0-9]+ [0-9]+:[0-9]+) CET', re.DOTALL).findall(element)
        datum = match[0]
        debug("++++++++ " + datum)
        sp2 = element.split('<a title="INSIDERS"')
        for i2 in range(0, len(sp2), 1):
            element = sp2[i2]
            debug("---------")
            debug(element)
            debug("---------")
            match = re.compile('href="([^"]+?)"[ ]+title="([^"]+?)"', re.DOTALL).findall(element)
            if not match:
                debug("No URL")
                continue
            url = "http://" + language2 + ".euronews.com" + match[0][0]
            title = match[0][1]
            match = re.compile('src="(.+?)"', re.DOTALL).findall(element)
            if match:
                thumb = match[0]
            else:
                thumb = ""
            match = re.compile('<p>(.+?)</p>', re.DOTALL).findall(element)
            if match:
                desc = match[0]
            else:
                desc = ""
            debug("URL: " + url)
            title = HTMLParser().unescape(title.decode('utf-8'))
            title = title.encode('utf-8')
            addLink(datum + " - " + title, url, 'playVideo', thumb, desc)
    xbmcplugin.endOfDirectory(pluginhandle)
    if forceViewMode == "true":
        xbmc.executebuiltin('Container.SetViewMode(' + viewMode + ')')
def Items(self, url, amount='all'):
    # Pull a set amount of podcasts, or all if no amount is given
    # (podcast name, image, description, publish date and play link from the RSS feed).
    ParseRSS = feedparser.parse(url)
    feed = ParseRSS.feed
    entries = ParseRSS.entries
    if str(amount).isdigit():
        amount = int(amount)
        for i in range(amount):
            try:
                image = self._ImageResolve(entries[i].image)
            except:
                image = self._ImageResolve(feed.image)
            playlink = self._PlayLinkResolve(entries[i].links)
            date = self._DateResolve(entries[i].published)
            try:
                description = entries[i].description
                description = HTMLParser().unescape(description)
                description = self._DescriptionClean(description)
            except:
                description = ''
            self.ITEM.append({
                'title': entries[i].title.encode('utf8'),
                'image': image,
                'description': description.encode('utf8'),
                'date': date,
                'playlink': playlink
            })
    elif amount == 'all':
        for entry in entries:
            try:
                image = self._ImageResolve(entry.image)
            except:
                image = self._ImageResolve(feed.image)
            playlink = self._PlayLinkResolve(entry.links)
            date = self._DateResolve(entry.published)
            try:
                description = entry.description
                description = HTMLParser().unescape(description)
                description = self._DescriptionClean(description)
            except:
                description = ''
            self.ITEM.append({
                'title': entry.title.encode('utf8'),
                'image': image,
                'description': description.encode('utf8'),
                'date': date,
                'playlink': playlink
            })
    else:
        koding.dolog(
            'Podcast RSS feed: wrong value for amount (must be an int); value entered = '
            + str(amount) + ', RSS url = ' + str(url),
            line_info=True)
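# A minimal standalone sketch of the feedparser pattern used by Items()
# above; the feed URL below is a placeholder, not one taken from the source,
# and the audio-link filter is an illustrative stand-in for _PlayLinkResolve.
import feedparser

d = feedparser.parse('http://example.com/podcast.rss')
for entry in d.entries[:5]:
    # Each entry carries the fields Items() resolves via its helpers.
    print(entry.get('title', ''), entry.get('published', ''))
    for link in entry.get('links', []):
        if link.get('type', '').startswith('audio/'):
            print(link.get('href'))  # candidate play link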
def _cleanTitle(title, html=True):
    if html:
        title = HTMLParser().unescape(title)
        if sys.version_info[0] < 3:  # for Python 2
            if isinstance(title, unicode):
                title = title.encode('utf-8')
        return title
    else:
        title = title.replace("&lt;", "<").replace("&gt;", ">").replace(
            "&amp;", "&").replace("&quot;", "\"").replace("&#039;", "'").replace(
            "&#034;", "\"").replace("&szlig;", "ß").replace("&ndash;", "-")
        title = title.replace("&Auml;", "Ä").replace("&Uuml;", "Ü").replace(
            "&Ouml;", "Ö").replace("&auml;", "ä").replace("&uuml;", "ü").replace(
            "&ouml;", "ö").replace("&eacute;", "é").replace("&egrave;", "è")
        title = title.replace("&#196;", "Ä").replace(
            "&#228;", "ä").replace("&#214;", "Ö").replace("&#246;", "ö").replace(
            "&#220;", "Ü").replace("&#252;", "ü").replace("&#223;", "ß")
        title = title.replace("&#39;", "'").strip()
        return title
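# For reference, a minimal sketch of the version-portable way to get the same
# unescaping without a hand-written entity table; the sample input and its
# output (in the trailing comment) are illustrative.
try:
    from html import unescape          # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser  # Python 2 fallback
    unescape = HTMLParser().unescape

print(unescape('M&auml;rchen &ndash; &quot;Teil 1&quot;'))
# Märchen – "Teil 1"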
def get_data(self):
    try:
        parser = etree.XMLParser(recover=True, remove_blank_text=True)
        root = etree.parse(self.file_name, parser)
    except Exception as e:
        raise e
    comprobante = root.getroot()
    fecha = comprobante.get('fecha')
    if fecha:
        fecha = fecha.encode("utf-8")
    serie = comprobante.get('serie')
    if serie:
        serie = serie.encode("utf-8")
    folio = comprobante.get('folio')
    if folio:
        folio = folio.encode("utf-8")
    metodo_pago = comprobante.get('metodoDePago')
    if metodo_pago:
        metodo_pago = metodo_pago.encode("utf-8")
    num_cta = comprobante.get('NumCtaPago')
    if num_cta:
        num_cta = num_cta.encode("utf-8")
    emisor = comprobante.find('{http://www.sat.gob.mx/cfd/3}Emisor')
    razon_social = emisor.get('nombre')
    if razon_social:
        razon_social = razon_social.encode("utf-8")
    rfc = emisor.get('rfc')
    if rfc:
        rfc = rfc.encode("utf-8")
    subtotal = comprobante.get('subTotal')
    if subtotal:
        subtotal = subtotal.encode("utf-8")
    total = comprobante.get('total')
    if total:
        total = total.encode("utf-8")
    impuestos = comprobante.find('{http://www.sat.gob.mx/cfd/3}Impuestos')
    impuestos_traslados = impuestos.find(
        '{http://www.sat.gob.mx/cfd/3}Traslados')
    traslados = []
    iva = 0
    ieps = 0
    if impuestos_traslados is not None:
        for traslado in impuestos_traslados:
            if traslado.get('impuesto') == 'IVA' and traslado.get('tasa') in IVA:
                iva = traslado.get('importe')
            if traslado.get('impuesto') == 'IEPS':
                ieps = traslado.get('importe')
    comprobante_conceptos = comprobante.find(
        '{http://www.sat.gob.mx/cfd/3}Conceptos')
    data = []
    for concepto in comprobante_conceptos:
        descripcion = HTMLParser().unescape(concepto.get('descripcion'))
        if descripcion:
            descripcion = descripcion.encode("utf-8")
        importe = concepto.get('importe')
        if importe:
            importe = importe.encode("utf-8")
        concepto_object = [
            fecha, serie, folio, metodo_pago, num_cta, rfc, razon_social,
            descripcion, importe, iva, ieps, total
        ]
        data.append(concepto_object)
    return data
def run(self):
    self.progressbar_show.emit(True)
    self.info_label.emit(translate("AddonsInstaller", "Retrieving description..."))
    if len(self.macros[self.idx]) > 2:
        desc = self.macros[self.idx][2]
        url = self.macros[self.idx][4]
    else:
        mac = self.macros[self.idx][0].replace(" ", "_")
        mac = mac.replace("&", "%26")
        mac = mac.replace("+", "%2B")
        url = "https://www.freecadweb.org/wiki/Macro_" + mac
        self.info_label.emit("Retrieving info from " + str(url))
        if ctx:
            u = urllib2.urlopen(url, context=ctx)
        else:
            u = urllib2.urlopen(url)
        p = u.read()
        if sys.version_info.major >= 3 and isinstance(p, bytes):
            p = p.decode("utf-8")
        u.close()
        code = re.findall("<pre>(.*?)<\/pre>", p.replace("\n", "--endl--"))
        if code:
            # code = code[0]
            # take the biggest code block
            code = sorted(code, key=len)[-1]
            code = code.replace("--endl--", "\n")
        else:
            self.info_label.emit(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
            self.progressbar_show.emit(False)
            self.stop = True
            return
        desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace("\n", " "))
        if desc:
            desc = desc[0]
        else:
            self.info_label.emit(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
            desc = "No description available"
        # clean HTML escape codes
        try:
            from HTMLParser import HTMLParser
        except ImportError:
            from html.parser import HTMLParser
        try:
            code = code.decode("utf8")
            code = HTMLParser().unescape(code)
            code = code.encode("utf8")
            code = code.replace("\xc2\xa0", " ")
        except:
            FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + mac + "\n")
        self.update_macro.emit(self.idx, self.macros[self.idx] + [desc, code, url])
    if self.macros[self.idx][1] == 1:
        message = ("<strong>" + translate("AddonsInstaller", "<strong>This addon is already installed.")
                   + "</strong><br>" + desc + ' - <a href="' + url
                   + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    else:
        message = (desc + ' - <a href="' + url
                   + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    self.info_label.emit(message)
    self.progressbar_show.emit(False)
    self.stop = True
def run(self):
    self.progressbar_show.emit(True)
    self.info_label.emit(translate("AddonsInstaller", "Retrieving description..."))
    if len(self.macros[self.idx]) > 2:
        desc = self.macros[self.idx][2]
        url = self.macros[self.idx][4]
    else:
        mac = self.macros[self.idx][0].replace(" ", "_")
        mac = mac.replace("&", "%26")
        mac = mac.replace("+", "%2B")
        url = "https://www.freecadweb.org/wiki/Macro_" + mac
        self.info_label.emit("Retrieving info from " + str(url))
        if ctx:
            u = urllib2.urlopen(url, context=ctx)
        else:
            u = urllib2.urlopen(url)
        p = u.read()
        if sys.version_info.major >= 3 and isinstance(p, bytes):
            p = p.decode("utf-8")
        u.close()
        code = re.findall("<pre>(.*?)<\/pre>", p.replace("\n", "--endl--"))
        if code:
            code = code[0]
            code = code.replace("--endl--", "\n")
        else:
            self.info_label.emit(translate("AddonsInstaller", "Unable to fetch the code of this macro."))
            self.progressbar_show.emit(False)
            self.stop = True
            return
        desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace("\n", " "))
        if desc:
            desc = desc[0]
        else:
            self.info_label.emit(translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
            desc = "No description available"
        # clean HTML escape codes
        try:
            from HTMLParser import HTMLParser
        except ImportError:
            from html.parser import HTMLParser
        try:
            code = code.decode("utf8")
            code = HTMLParser().unescape(code)
            code = code.encode("utf8")
            code = code.replace("\xc2\xa0", " ")
        except:
            FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + mac + "\n")
        self.update_macro.emit(self.idx, self.macros[self.idx] + [desc, code, url])
    if self.macros[self.idx][1] == 1:
        message = ("<strong>" + translate("AddonsInstaller", "<strong>This addon is already installed.")
                   + "</strong><br>" + desc + ' - <a href="' + url
                   + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    else:
        message = (desc + ' - <a href="' + url
                   + '"><span style="word-wrap: break-word;width:15em;text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    self.info_label.emit(message)
    self.progressbar_show.emit(False)
    self.stop = True
def retroclassic():
    # call scraper from folder
    from .scrapers import retrovision
    # run scraper
    retrovision.all_movies()
    # return list of dicts from scraper
    for items in retrovision.ReturnList:
        # HTMLParser cleans up any text that is still HTML-encoded
        Description = HTMLParser().unescape(items.get('description', ''))
        Description = Description.encode('utf-8')
        BYB.addDir_file(ItemColor(items.get('title', '')), items.get('playlink', ''),
                        902, items.get('icon', ''), addon_fanart, Description, '', '', '')
    del retrovision.ReturnList[:]
def fill_details_from_wiki(self, url):
    try:
        if ctx:
            u = urllib2.urlopen(url, context=ctx)
        else:
            u = urllib2.urlopen(url)
    except urllib2.HTTPError:
        return
    p = u.read()
    if sys.version_info.major >= 3 and isinstance(p, bytes):
        p = p.decode('utf-8')
    u.close()
    code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--'))
    if code:
        # code = code[0]
        # take the biggest code block
        code = sorted(code, key=len)[-1]
        code = code.replace('--endl--', '\n')
    else:
        FreeCAD.Console.PrintWarning(
            translate("AddonsInstaller", "Unable to fetch the code of this macro."))
    # Clean HTML escape codes.
    try:
        from HTMLParser import HTMLParser
    except ImportError:
        from html.parser import HTMLParser
    try:
        code = code.decode('utf8')
        code = HTMLParser().unescape(code)
        code = code.encode('utf8')
        code = code.replace('\xc2\xa0', ' ')
    except:
        # The original referenced an undefined `mac` here; report the url instead.
        FreeCAD.Console.PrintWarning(
            translate("AddonsInstaller", "Unable to clean macro code: ") + url + '\n')
    desc = re.findall(
        "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
        p.replace('\n', ' '))
    if desc:
        desc = desc[0]
    else:
        FreeCAD.Console.PrintWarning(
            translate("AddonsInstaller", "Unable to retrieve a description for this macro."))
        desc = "No description available"
    self.desc = desc
    self.url = url
    self.code = code
    self.parsed = True
def process(self, file_data):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        file_mime_type = m.id_buffer(file_data['contents'])
    metadata = {}
    if file_mime_type == 'text/plain':
        text = file_data['contents']
    elif file_mime_type == 'text/html':
        text = parse_html(file_data['contents'], True, ['script', 'style'])
    elif file_mime_type == 'application/pdf':
        text, metadata = extract_pdf(file_data['contents'])
    else:
        # If we can't detect the mimetype we add a flag that can be read by
        # the frontend to provide more information on why the document
        # wasn't processed.
        # XXX: We're returning an empty text because if we don't the
        # pipeline will run indefinitely. The right approach is to make
        # pypelinin understand a specific exception (something like
        # StopPipeline) as a signal to stop processing this pipeline.
        return {
            'mimetype': 'unknown',
            'text': "",
            'file_metadata': {},
            'language': ""
        }
    text, forced_decoding = trial_decode(text)
    if isinstance(text, unicode):
        # HTMLParser only handles unicode objects. We can't pass the text
        # through it if we don't know the encoding, and it's possible we
        # also shouldn't. There's no way of knowing if it's badly encoded
        # html or a binary blob that happens to have bytes that look like
        # html entities.
        text = HTMLParser().unescape(text)
    text = clean(text)
    if isinstance(text, unicode):
        language = cld.detect(text.encode('utf-8'))[1]
    else:
        language = cld.detect(text)[1]
    return {
        'text': text,
        'file_metadata': metadata,
        'language': language,
        'mimetype': file_mime_type,
        'forced_decoding': forced_decoding
    }
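# The `trial_decode` helper called above is not shown in this snippet. A
# minimal sketch of what such a helper might look like, assuming it tries a
# fixed list of codecs and flags when nothing decoded cleanly; the name of
# the flag and the codec order are illustrative, not the project's actual
# implementation:
def trial_decode(data, codecs=('utf-8', 'latin-1')):
    """Return (text, forced_decoding): text decoded by the first codec
    that works, or the original bytes with forced_decoding=True."""
    if isinstance(data, unicode):  # already decoded (Python 2 corpus)
        return data, False
    for codec in codecs:
        try:
            return data.decode(codec), False
        except UnicodeDecodeError:
            continue
    return data, True  # caller then skips HTMLParser().unescape()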
def getKwl(url):
    '''
    :param url:
    :return: the kwl text
    '''
    info = getList(url)
    if info == False:
        return False
    kwl = ''
    for item in info:
        kwl += ' <so name="%s." artist="%s" album=""></so>\r\n' % (
            item[0].replace('"', ''), item[1])
    kwl = '<so>\r\n%s</so>' % kwl
    kwl = HTMLParser().unescape(kwl)
    kwl = kwl.encode('gb2312', errors='ignore')
    return kwl
def extract_tweets(raw_html):
    '''
    Scrape the specified HTML string for Tweets and some related information.
    Returns a list of lists (username, friendly_time, timestamp, tweet_text).
    '''
    if len(raw_html) == 0:
        raise TypeError("No raw_html specified")
    # Set up some temporary and holding variables for later
    retrieved_tweets = []
    active_tweet = []
    to_append = ""
    # Query for username UNION time UNION timestamp UNION text
    xpath_query = ("//span[starts-with(@class,'username')] | "
                   "//small[@class = 'time']/a/@title | "
                   "//span[starts-with(@class, '_timestamp')]/@data-time-ms | "
                   "//p[contains(@class,'js-tweet-text')]")
    tree = html.fromstring(raw_html)
    query_results = tree.xpath(xpath_query)
    # Walk through query results
    for q in query_results:
        # We can extract all elements directly, EXCEPT for tweet text,
        # because that's not an actual text element yet.
        # See http://stackoverflow.com/questions/29398751 for why we query it
        # like this (it's because of formatting)
        if type(q) is lxml.html.HtmlElement:
            to_append = q.text_content()
        else:
            to_append = q
        # Clean the extracted element up a little; make sure it's UTF-8
        # encoded and contains no linebreaks
        to_append = HTMLParser().unescape(to_append)
        to_append = to_append.encode('utf-8', errors='replace')
        to_append = to_append.replace('\n', ' ')
        to_append = to_append.replace(';', ',')
        # Append the cleaned-up string to the active element
        active_tweet.append(to_append)
        # Each tweet item contains (username, time, timestamp, text), so:
        # once we have reached a length of 4, the current item is finished
        # and can be appended to the result set
        if len(active_tweet) == 4:
            retrieved_tweets.append(active_tweet)
            active_tweet = []
    # Once we've walked through all query elements, the analysis is finished
    # and we return the list-of-lists
    return retrieved_tweets
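# Hypothetical usage of extract_tweets(); the file name is a placeholder for
# a saved Twitter search-results page, not a path from the source.
with open('saved_search.html') as f:
    for username, friendly_time, timestamp, text in extract_tweets(f.read()):
        print(username + ' | ' + timestamp + ' | ' + text[:60])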
def clear_string(name):
    """
    Convert all the &# codes to chars, remove extra spaces and normalize
    :param name: string to convert
    :type name: object
    :return: converted string
    """
    from HTMLParser import HTMLParser
    if type(name) is not unicode:
        name = name.__str__()
    if type(name) is str:
        try:
            name = name.decode('utf-8')
        except:
            name = unicode(name, 'utf-8', errors='replace')
    name = name.replace('<![CDATA[', '').replace(']]', '')
    name = HTMLParser().unescape(name)
    if type(name) is not str:
        name = name.encode('utf-8')
    return name
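# Illustrative call (Python 2): the numeric entity is decoded, the CDATA
# wrapper stripped, and the result re-encoded to a UTF-8 byte string.
print(clear_string('<![CDATA[Caf&#233; &amp; Bar]]'))
# Café & Bar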
def multiple_videos_playlists(params):
    _id = params.id
    title = HTMLParser().unescape(params.name)
    if type(title) == unicode:
        title = title.encode('utf-8')
    url = build_url('posts', {'include': _id})
    i = requests.get(url).json()[0]
    content = i.get('content')['rendered']
    yt_playlists, yt_videos = parse_ids_content(content)
    listing = []
    for index, playlist in enumerate(yt_playlists):
        yt_pid, yt_vid = playlist
        image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
        listing.append({
            'label': "{0} {1}".format(_("Playlist"), index + 1),
            'thumb': image,
            'is_playable': False,
            'url': "plugin://plugin.video.youtube/playlist/{0}/".format(yt_pid),
        })
    for index, yt_vid in enumerate(yt_videos):
        image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
        listing.append({
            'label': "{0} {1}".format(_("Video"), index + 1),
            'thumb': image,
            'is_playable': True,
            'url': plugin.get_url(action='play', youtube_id=yt_vid, name=title),
        })
    return listing
def list_videos(url, enable_bookmark=True):
    json_data = requests.get(url).json()
    listing = []
    for i in json_data:
        _id = i.get('id')
        title = HTMLParser().unescape(i.get('title')['rendered'])
        if type(title) == unicode:
            title = title.encode('utf-8')
        content = i.get('content')['rendered']
        date = i.get('date')[:10]
        slug = i.get('slug')
        soup = bs4.BeautifulSoup(content, 'html5lib')
        try:
            plot = soup.find('meta', {'itemprop': 'description'})['content']
        except TypeError:
            plot = ""
        yt_playlists, yt_videos = parse_ids_content(content)
        context_menu = []
        if enable_bookmark:
            context_menu.append(
                (_("Add to Bookmarks"),
                 'XBMC.RunPlugin({0})'.format(plugin.get_url(action='add_bookmark', id=_id))),
            )
        context_menu += [
            (_("Show tags"),
             'XBMC.Container.Update({0})'.format(plugin.get_url(action='tags_by_post', id=_id))),
            (_("Show categories"),
             'XBMC.Container.Update({0})'.format(plugin.get_url(action='categories_by_post', id=_id))),
            #(_("Force mirror search"),
            # 'XBMC.RunPlugin({0})'.format(plugin.get_url(action='force_mirror', name=title))),
        ]
        if len(yt_videos) == 0 and len(yt_playlists) == 0:
            # search for a mirror
            image = "http://dokustreams.de/wp-content/uploads/{0}.jpg".format(slug)
            listing.append({
                'label': title,
                'thumb': image,
                'info': {
                    'video': {
                        'title': title,
                        'plot': plot,
                        'aired': date,
                        'year': date[:4],
                    }
                },
                'context_menu': context_menu,
                'is_playable': True,
                'url': plugin.get_url(action='play', name=title),
            })
        elif len(yt_videos) == 1 and len(yt_playlists) == 0:
            # start the video directly
            yt_vid = yt_videos[0]
            image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
            listing.append({
                'label': title,
                'thumb': image,
                'info': {
                    'video': {
                        'title': title,
                        'plot': plot,
                        'aired': date,
                        'year': date[:4],
                    }
                },
                'context_menu': context_menu,
                'is_playable': True,
                'url': plugin.get_url(action='play', youtube_id=yt_vid, name=title),
            })
        elif len(yt_videos) == 0 and len(yt_playlists) == 1:
            # show the playlist directly
            yt_pid, yt_vid = yt_playlists[0]
            image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
            listing.append({
                'label': title,
                'thumb': image,
                'info': {
                    'video': {
                        'title': title,
                        'plot': plot,
                        'aired': date,
                        'year': date[:4],
                    }
                },
                'context_menu': context_menu,
                'is_playable': False,
                'url': "plugin://plugin.video.youtube/playlist/{0}/".format(yt_pid),
            })
        else:
            # show playlists and videos together
            image = "http://dokustreams.de/wp-content/uploads/{0}.jpg".format(slug)
            listing.append({
                'label': title,
                'thumb': image,
                'context_menu': context_menu,
                'is_playable': False,
                'url': plugin.get_url(action='multiple_videos_playlists', id=_id, name=title),
            })
    if len(json_data) == PER_PAGE:
        next_page = page_from_url(url) + 1
        next_url = edit_url(url, {'page': next_page})
        listing.append({
            'label': '[COLOR blue]{0}[/COLOR]'.format(_("Next page")),
            'url': plugin.get_url(action='posts_by_url', url=next_url),
        })
    return listing
    '%d0%b3%d1%80%d1%8b%d0%bd%d1%8c',  # Gryn` 178 OKc
    '%D0%BB%D0%B5%D1%89%D1%83%D0%BA',  # Leschuk 3929 OKc
    '%D0%9A%D0%B0%D0%BC%D0%BB%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE',  # Kamlichenko 5 OKc
    '%D0%9A%D0%BE%D0%BC%D0%BB%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE',  # Komlichenko 440 OKc
    '%D0%9A%D0%B0%D0%BB%D0%B5%D0%BD%D0%B8%D1%87%D0%B5%D0%BD%D0%BA%D0%BE',  # Kalenichenko 2019 OKc
    '%D0%9A%D0%B0%D0%BB%D0%B8%D0%BD%D0%B8%d1%87%D0%B5%D0%BD%D0%BA%D0%BE'.upper()]  # Kalinichenko 20k
# Nadijka, Nadejka 0; Abazovka - no search by locality available
# Moroz: Vadim 200 people, Andr 994, vlad 2400; pavel 400, mixail 1200, maks 200,
# roman 407, anat 1208, evg 401, taisia 71, Lubov 1001, tatyana 2000, dmit 700
k0 = 1
link0 = 'http://nomerorg.com/allukraina/lastName_' + name_list[k0] + '_pagenumber_'
nm = 1 + int(65645 / 15)
k_proxy = 0
u.install_opener(u.build_opener(u.ProxyHandler({'http': proxy_list[k_proxy]})))
for k in range(877, nm):
    link1 = link0 + str(k) + '.html'
    flag1 = False
    fd = open(dir0 + 'name' + str(k0) + '_' + str(10000 + k) + '_.csv', 'w')
    print k, k_proxy
    while not flag1:
        try:
            url1 = u.urlopen(u.Request(link1, headers=url_hdr), timeout=3)
            page1 = html.fragments_fromstring(url1.read())
            flag1 = (1 < len(page1))
        except:
            k_proxy += 1
            u.install_opener(u.build_opener(u.ProxyHandler({'http': proxy_list[k_proxy]})))
            print k, k_proxy
    # print HTMLParser().unescape(u.unquote(page1[0]))
    o = html.tostring(page1[3])
    o = o[:o.find('</table>') - 10].replace('</td><td>', ',').replace('</td></tr><tr><td>', '\n')
    o = o[o.rfind('/th></tr><tr><td>') + 17:] + '\n'
    o = HTMLParser().unescape(u.unquote(o))
    if -1 < o.find('adsbygoogle'):
        break
    fd.write(o.encode('utf-16'))
    time.sleep(.1)
    fd.close()
# Forbidden (blocked by queries) / connection refused (try again)

# --- consolidate files
k = 1
o1 = glob.glob(dir0 + 'name1/name' + str(k) + '_1*.csv')
fd = open(dir0 + 'name' + str(k) + '.csv', 'wb')
for q in o1:
    fd_in = open(q, 'rb')
    o = fd_in.read()
    fd_in.close()
    fd.write(o)
fd.close()
def find_dir(cco):
    f = {'q': cco.encode('utf-8')}
    u = dir_server + urllib.urlencode(f)
    r = None
    try:
        s = requests.Session()
        r = s.get(u)
        print(str(r.text.encode('utf-8')))
        headers = {'Content-type': 'application/x-www-form-urlencoded'}
        # NOTE: the password value is redacted in the source
        data="userid="+dir_user+"&password="******"&target=&smauthreason=&smquerydata=&smagentname=&postpreservationdata=&SMENC=&SMLOCALE="
        r = s.post(sso_url, data, headers)
    except requests.exceptions.ConnectionError:
        return "Connection error to directory server"
    try:
        from BeautifulSoup import BeautifulSoup
        from HTMLParser import HTMLParser
    except ImportError:
        from bs4 import BeautifulSoup
        from html.parser import HTMLParser
    html = HTMLParser().unescape(r.text)
    sys.stderr.write("html: " + str(html.encode('utf-8')) + "\n")
    parsed_html = BeautifulSoup(html)
    table = parsed_html.body.find('table', attrs={'id': 'resultsTable'})
    if table is not None:
        result_list = [unicodedata.normalize('NFKD', i.text)
                       for i in table.findAll('a', attrs={'class': 'hover-link'})]
        found = False
        for n in result_list:
            m = re.search(r"\(([A-Za-z0-9]+)\)", n)
            if m.group(1) == cco:
                u = dir_detail_server + cco
                r = s.get(u)
                print(r.text)
                html = HTMLParser().unescape(r.text)
                sys.stderr.write("html: " + str(html.encode('utf-8')) + "\n")
                parsed_html = BeautifulSoup(html)
                found = True
                print("Found!")
        if not found:
            txt = "Are you looking for one of these people:"
            for i in result_list:
                txt += "\n * " + str(i)
            return txt
    name = parsed_html.body.find('h2', attrs={'class': 'userName'})
    sys.stderr.write("name: " + str(name) + "\n")
    if not hasattr(name, 'text'):
        return "CCO id not found!"
    else:
        tmp = parsed_html.body.find('p', attrs={'class': 'userId'})
        print("tmp: " + str(tmp))
        m = re.search(r"\(([A-Za-z0-9]+)\)", str(tmp))
        print("m: " + str(m))
        real_cco = str(m.group(1))
        sys.stderr.write("real_cco: " + str(real_cco) + "\n")
        title = parsed_html.body.find('p', attrs={'class': 'des'})
        sys.stderr.write("title: " + str(title) + "\n")
        manager = parsed_html.body.find('a', attrs={'class': 'hover-link'})
        sys.stderr.write("manager: " + str(manager) + "\n")
        phone_text = ""
        phone = parsed_html.body.find('div', attrs={'id': 'dir_phone_links'})
        if phone is not None:
            for p in phone.findAll('p'):
                if p.text.find("Work") > -1 or p.text.find("Mobile") > -1:
                    phone_text += str(p.text) + "<br>"
        u = str(parsed_html.body.find('div', attrs={'class': 'profImg'}).find('img')['src'])
        response = requests.get(u, stream=True)
        encoded_string = base64.b64encode(response.raw.read())
        return (name.text + "<br>;" + title.text.replace('.', ' ') + "<br>;"
                + manager.text + "<br>;" + phone_text + ";" + encoded_string + ";"
                + "<a href=\"http://wwwin-tools.cisco.com/dir/details/" + real_cco
                + "\">directory link</a>")
self.printlog("Connection successful.") except Exception, e: # if fails, retry self.printlog( "Connection failed: %s (retrying in 30 seconds)" % e) time.sleep(30) self.printlog("Retrying... (%s retries left)" % retry['count']) if not page: funcs.printlog( "Connection failed after %s retries. Please check your internet connection. Exiting..." % retry['max']) sys.exit(1) page = page.decode('utf-8') page = HTMLParser().unescape(page) page = page.encode('utf-8') return page class MLStripper(HTMLParser): def __init__(self): self.reset() self.fed = [] def handle_data(self, d): self.fed.append(d) def get_data(self): return ''.join(self.fed) def strip_tags(self, html):
class FieldStorage:

    """Store a sequence of fields, reading multipart/form-data.

    This class provides naming, typing, files stored on disk, and
    more.  At the top level, it is accessible like a dictionary, whose
    keys are the field names.  (Note: None can occur as a field name.)
    The items are either a Python list (if there's multiple values) or
    another FieldStorage or MiniFieldStorage object.  If it's a single
    object, it has the following attributes:

    name: the field name, if specified; otherwise None

    filename: the filename, if specified; otherwise None; this is the
        client side filename, *not* the file name on which it is
        stored (that's a temporary file you don't deal with)

    value: the value as a *string*; for file uploads, this
        transparently reads the file every time you request the value

    file: the file(-like) object from which you can read the data;
        None if the data is stored a simple string

    type: the content-type, or None if not specified

    type_options: dictionary of options specified on the content-type
        line

    disposition: content-disposition, or None if not specified

    disposition_options: dictionary of corresponding options

    headers: a dictionary(-like) object (sometimes rfc822.Message or a
        subclass thereof) containing *all* headers

    The class is subclassable, mostly for the purpose of overriding
    the make_file() method, which is called internally to come up with
    a file open for reading and writing.  This makes it possible to
    override the default choice of storing all files in a temporary
    directory and unlinking them as soon as they have been opened.

    """

    def __init__(self, fp=None, headers=None, outerboundary="",
                 environ=os.environ, keep_blank_values=0, strict_parsing=0):
        """Constructor.  Read multipart/* until last part.

        Arguments, all optional:

        fp              : file pointer; default: sys.stdin
            (not used when the request method is GET)

        headers         : header dictionary-like object; default:
            taken from environ as per CGI spec

        outerboundary   : terminating multipart boundary
            (for internal use only)

        environ         : environment dictionary; default: os.environ

        keep_blank_values: flag indicating whether blank values in
            percent-encoded forms should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.
""" method = 'GET' self.keep_blank_values = keep_blank_values self.strict_parsing = strict_parsing if 'REQUEST_METHOD' in environ: method = environ['REQUEST_METHOD'].upper() self.qs_on_post = None if method == 'GET' or method == 'HEAD': if 'QUERY_STRING' in environ: qs = environ['QUERY_STRING'] elif sys.argv[1:]: qs = sys.argv[1] else: qs = "" fp = StringIO(qs) if headers is None: headers = {'content-type': "application/x-www-form-urlencoded"} if headers is None: headers = {} if method == 'POST': # Set default content-type for POST to what's traditional headers['content-type'] = "application/x-www-form-urlencoded" if 'CONTENT_TYPE' in environ: headers['content-type'] = environ['CONTENT_TYPE'] if 'QUERY_STRING' in environ: self.qs_on_post = environ['QUERY_STRING'] if 'CONTENT_LENGTH' in environ: headers['content-length'] = environ['CONTENT_LENGTH'] self.fp = fp or sys.stdin self.headers = headers self.outerboundary = outerboundary # Process content-disposition header cdisp, pdict = "", {} if 'content-disposition' in self.headers and rfc6266: cd = rfc6266.parse_headers(self.headers['content-disposition'], relaxed=True) cdisp, pdict = cd.disposition, cd.assocs elif 'content-disposition' in self.headers: cdisp, pdict = parse_header(self.headers['content-disposition']) self.disposition = cdisp self.disposition_options = pdict self.name = None if 'name' in pdict: self.name = pdict['name'] self.filename = None if 'filename' in pdict: self.filename = pdict['filename'] if 'filename*' in pdict: self.filename = pdict['filename*'].string if self.filename and '&' in self.filename: from HTMLParser import HTMLParser self.filename = HTMLParser().unescape(self.filename) if isinstance(self.filename, unicode): self.filename = self.filename.encode('utf8') # Process content-type header # # Honor any existing content-type header. But if there is no # content-type header, use some sensible defaults. Assume # outerboundary is "" at the outer level, but something non-false # inside a multi-part. The default for an inner part is text/plain, # but for an outer part it should be urlencoded. This should catch # bogus clients which erroneously forget to include a content-type # header. # # See below for what we do if there does exist a content-type header, # but it happens to be something we don't understand. 
        if 'content-type' in self.headers:
            ctype, pdict = parse_header(self.headers['content-type'])
        elif self.outerboundary or method != 'POST':
            ctype, pdict = "text/plain", {}
        else:
            ctype, pdict = 'application/x-www-form-urlencoded', {}
        self.type = ctype
        self.type_options = pdict
        self.innerboundary = ""
        if 'boundary' in pdict:
            self.innerboundary = pdict['boundary']
        clen = -1
        if 'content-length' in self.headers:
            try:
                clen = int(self.headers['content-length'])
            except ValueError:
                pass
            if maxlen and clen > maxlen:
                raise ValueError, 'Maximum content length exceeded'
        self.length = clen

        self.list = self.file = None
        self.done = 0
        if ctype == 'application/x-www-form-urlencoded':
            self.read_urlencoded()
        elif ctype[:10] == 'multipart/':
            self.read_multi(environ, keep_blank_values, strict_parsing)
        else:
            self.read_single()

    def __repr__(self):
        """Return a printable representation."""
        return "FieldStorage(%r, %r, %r)" % (
                self.name, self.filename, self.value)

    def __iter__(self):
        return iter(self.keys())

    def __getattr__(self, name):
        if name != 'value':
            raise AttributeError, name
        if self.file:
            self.file.seek(0)
            value = self.file.read()
            self.file.seek(0)
        elif self.list is not None:
            value = self.list
        else:
            value = None
        return value

    def __getitem__(self, key):
        """Dictionary style indexing."""
        if self.list is None:
            raise TypeError, "not indexable"
        found = []
        for item in self.list:
            if item.name == key:
                found.append(item)
        if not found:
            raise KeyError, key
        if len(found) == 1:
            return found[0]
        else:
            return found

    def getvalue(self, key, default=None):
        """Dictionary style get() method, including 'value' lookup."""
        if key in self:
            value = self[key]
            if type(value) is type([]):
                return map(attrgetter('value'), value)
            else:
                return value.value
        else:
            return default

    def getfirst(self, key, default=None):
        """ Return the first value received."""
        if key in self:
            value = self[key]
            if type(value) is type([]):
                return value[0].value
            else:
                return value.value
        else:
            return default

    def getlist(self, key):
        """ Return list of received values."""
        if key in self:
            value = self[key]
            if type(value) is type([]):
                return map(attrgetter('value'), value)
            else:
                return [value.value]
        else:
            return []

    def keys(self):
        """Dictionary style keys() method."""
        if self.list is None:
            raise TypeError, "not indexable"
        return list(set(item.name for item in self.list))

    def has_key(self, key):
        """Dictionary style has_key() method."""
        if self.list is None:
            raise TypeError, "not indexable"
        return any(item.name == key for item in self.list)

    def __contains__(self, key):
        """Dictionary style __contains__ method."""
        if self.list is None:
            raise TypeError, "not indexable"
        return any(item.name == key for item in self.list)

    def __len__(self):
        """Dictionary style len(x) support."""
        return len(self.keys())

    def __nonzero__(self):
        return bool(self.list)

    def read_urlencoded(self):
        """Internal: read data in query string format."""
        qs = self.fp.read(self.length)
        if self.qs_on_post:
            qs += '&' + self.qs_on_post
        self.list = list = []
        for key, value in urlparse.parse_qsl(qs, self.keep_blank_values,
                                             self.strict_parsing):
            list.append(MiniFieldStorage(key, value))
        self.skip_lines()

    FieldStorageClass = None

    def read_multi(self, environ, keep_blank_values, strict_parsing):
        """Internal: read a part that is itself multipart."""
        ib = self.innerboundary
        if not valid_boundary(ib):
            raise ValueError, 'Invalid boundary in multipart form: %r' % (ib,)
        self.list = []
        if self.qs_on_post:
            for key, value in urlparse.parse_qsl(self.qs_on_post,
                                                 self.keep_blank_values,
                                                 self.strict_parsing):
                self.list.append(MiniFieldStorage(key, value))
            FieldStorageClass = None

        klass = self.FieldStorageClass or self.__class__
        part = klass(self.fp, {}, ib,
                     environ, keep_blank_values, strict_parsing)
        # Throw first part away
        while not part.done:
            headers = rfc822.Message(self.fp)
            part = klass(self.fp, headers, ib,
                         environ, keep_blank_values, strict_parsing)
            self.list.append(part)
        self.skip_lines()

    def read_single(self):
        """Internal: read an atomic part."""
        if self.length >= 0:
            self.read_binary()
            self.skip_lines()
        else:
            self.read_lines()
        self.file.seek(0)

    bufsize = 8*1024            # I/O buffering size for copy to file

    def read_binary(self):
        """Internal: read binary data."""
        self.file = self.make_file('b')
        todo = self.length
        if todo >= 0:
            while todo > 0:
                data = self.fp.read(min(todo, self.bufsize))
                if not data:
                    self.done = -1
                    break
                self.file.write(data)
                todo = todo - len(data)

    def read_lines(self):
        """Internal: read lines until EOF or outerboundary."""
        self.file = self.__file = StringIO()
        if self.outerboundary:
            self.read_lines_to_outerboundary()
        else:
            self.read_lines_to_eof()

    def __write(self, line):
        if self.__file is not None:
            if self.__file.tell() + len(line) > 1000:
                self.file = self.make_file('')
                self.file.write(self.__file.getvalue())
                self.__file = None
        self.file.write(line)

    def read_lines_to_eof(self):
        """Internal: read lines until EOF."""
        while 1:
            line = self.fp.readline(1<<16)
            if not line:
                self.done = -1
                break
            self.__write(line)

    def read_lines_to_outerboundary(self):
        """Internal: read lines until outerboundary."""
        next = "--" + self.outerboundary
        last = next + "--"
        delim = ""
        last_line_lfend = True
        while 1:
            line = self.fp.readline(1<<16)
            if not line:
                self.done = -1
                break
            if delim == "\r":
                line = delim + line
                delim = ""
            if line[:2] == "--" and last_line_lfend:
                strippedline = line.strip()
                if strippedline == next:
                    break
                if strippedline == last:
                    self.done = 1
                    break
            odelim = delim
            if line[-2:] == "\r\n":
                delim = "\r\n"
                line = line[:-2]
                last_line_lfend = True
            elif line[-1] == "\n":
                delim = "\n"
                line = line[:-1]
                last_line_lfend = True
            elif line[-1] == "\r":
                # We may interrupt \r\n sequences if they span the 2**16
                # byte boundary
                delim = "\r"
                line = line[:-1]
                last_line_lfend = False
            else:
                delim = ""
                last_line_lfend = False
            self.__write(odelim + line)

    def skip_lines(self):
        """Internal: skip lines until outer boundary if defined."""
        if not self.outerboundary or self.done:
            return
        next = "--" + self.outerboundary
        last = next + "--"
        last_line_lfend = True
        while 1:
            line = self.fp.readline(1<<16)
            if not line:
                self.done = -1
                break
            if line[:2] == "--" and last_line_lfend:
                strippedline = line.strip()
                if strippedline == next:
                    break
                if strippedline == last:
                    self.done = 1
                    break
            last_line_lfend = line.endswith('\n')

    def make_file(self, binary=None):
        """Overridable: return a readable & writable file.

        The file will be used as follows:
        - data is written to it
        - seek(0)
        - data is read from it

        The 'binary' argument is unused -- the file is always opened
        in binary mode.

        This version opens a temporary file for reading and writing,
        and immediately deletes (unlinks) it.  The trick (on Unix!) is
        that the file can still be used, but it can't be opened by
        another process, and it will automatically be deleted when it
        is closed or when the current process terminates.

        If you want a more permanent file, you derive a class which
        overrides this method.
        If you want a visible temporary file that is nevertheless
        automatically deleted when the script terminates, try defining
        a __del__ method in a derived class which unlinks the temporary
        files you have created.

        """
        import tempfile
        return tempfile.TemporaryFile("w+b")
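# A minimal CGI script using the class above (Python 2), assuming it is
# importable as cgi.FieldStorage as in the standard library; the field names
# are illustrative.
import cgi

form = cgi.FieldStorage()          # parses stdin / QUERY_STRING once
name = form.getfirst('name', '')   # first value received, or the default
tags = form.getlist('tag')         # always a list, possibly empty
print "Content-Type: text/plain"
print ""
print "name=%r tags=%r" % (name, tags)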
def fill_details_from_wiki(self, url): code = "" try: u = urlopen(url) except: print("AddonManager: Debug: unable to open URL", url) return p = u.read() if sys.version_info.major >= 3 and isinstance(p, bytes): p = p.decode('utf-8') u.close() # check if the macro page has its code hosted elsewhere, download if needed if "rawcodeurl" in p: rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">", p) if rawcodeurl: rawcodeurl = rawcodeurl[0] try: u2 = urlopen(rawcodeurl) except: print("AddonManager: Debug: unable to open URL", rawcodeurl) return # code = u2.read() # github is slow to respond... We need to use this trick below response = "" block = 8192 #expected = int(u2.headers['content-length']) while 1: #print("expected:",expected,"got:",len(response)) data = u2.read(block) if not data: break if sys.version_info.major >= 3 and isinstance(data, bytes): data = data.decode('utf-8') response += data if response: code = response u2.close() if not code: code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--')) if code: # code = code[0] # take the biggest code block code = sorted(code, key=len)[-1] code = code.replace('--endl--', '\n') else: FreeCAD.Console.PrintWarning( translate("AddonsInstaller", "Unable to fetch the code of this macro.")) # Clean HTML escape codes. try: from HTMLParser import HTMLParser except ImportError: from html.parser import HTMLParser if sys.version_info.major < 3: code = code.decode('utf8') try: code = HTMLParser().unescape(code) code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ') except: FreeCAD.Console.PrintWarning( translate("AddonsInstaller", "Unable to clean macro code") + ": " + code + '\n') if sys.version_info.major < 3: code = code.encode('utf8') desc = re.findall( "<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace('\n', ' ')) if desc: desc = desc[0] else: FreeCAD.Console.PrintWarning( translate("AddonsInstaller", "Unable to retrieve a description for this macro.")) desc = "No description available" self.desc = desc self.url = url self.code = code self.parsed = True
def fill_details_from_wiki(self, url): code = "" try: u = urlopen(url) except urllib2.HTTPError: return p = u.read() if sys.version_info.major >= 3 and isinstance(p, bytes): p = p.decode('utf-8') u.close() # check if the macro page has its code hosted elsewhere, download if needed if "rawcodeurl" in p: rawcodeurl = re.findall("rawcodeurl.*?href=\"(http.*?)\">",p) if rawcodeurl: rawcodeurl = rawcodeurl[0] try: u2 = urlopen(rawcodeurl) except urllib2.HTTPError: return # code = u2.read() # github is slow to respond... We need to use this trick below response = "" block = 8192 #expected = int(u2.headers['content-length']) while 1: #print("expected:",expected,"got:",len(response)) data = u2.read(block) if not data: break if sys.version_info.major >= 3 and isinstance(data, bytes): data = data.decode('utf-8') response += data if response: code = response u2.close() if not code: code = re.findall('<pre>(.*?)<\/pre>', p.replace('\n', '--endl--')) if code: # code = code[0] # take the biggest code block code = sorted(code, key=len)[-1] code = code.replace('--endl--', '\n') else: FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to fetch the code of this macro.")) # Clean HTML escape codes. try: from HTMLParser import HTMLParser except ImportError: from html.parser import HTMLParser if sys.version_info.major < 3: code = code.decode('utf8') try: code = HTMLParser().unescape(code) code = code.replace(b'\xc2\xa0'.decode("utf-8"), ' ') except: FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to clean macro code: ") + code + '\n') if sys.version_info.major < 3: code = code.encode('utf8') desc = re.findall("<td class=\"ctEven left macro-description\">(.*?)<\/td>", p.replace('\n', ' ')) if desc: desc = desc[0] else: FreeCAD.Console.PrintWarning(translate("AddonsInstaller", "Unable to retrieve a description for this macro.")) desc = "No description available" self.desc = desc self.url = url self.code = code self.parsed = True
# set file-level metadata
for page in item.pages:
    fileID = itemID + '_' + page.id
    pagelabel = page.label
    pageRefURL = page.refurl
    # set transcription
    if ('full' in page.info) and page.info['full']:
        transcription = str(page.info['full'].encode('ascii', 'ignore'))
        transcription = HTMLParser().unescape(transcription)
    elif ('fula' in page.info) and page.info['fula']:
        transcription = str(page.info['fula'].encode('ascii', 'ignore'))
        transcription = HTMLParser().unescape(transcription)
    else:
        transcription = ''
    # skip if 'n/a' in transcription field
    if (alias == 'cwd') and re.match('n/a', transcription.encode('ascii', 'ignore')):
        pass
    else:
        # set transcription status
        if transcription == '':
            status = 'Not Started'
        elif (alias == 'cwd') and re.match('reviewed', str(page.info['transc'])):
            status = 'Completed'
        else:
            status = 'Needs Review'
        # if alias == 'cookbooks':
        #     url = tempdir + '/cookbooks_' + page.file.replace('jp2', 'jpg')
        # code below is for downloading and uploading to server
        if page.file[-3:] == 'jp2':
            # download image to temp directory (downloads locally, move these to dropbox on server)
            # imagepath = page.imageurl
def list_videos(url, enable_bookmark=True):
    json_data = requests.get(url).json()
    listing = []
    for i in json_data:
        _id = i.get('id')
        title = HTMLParser().unescape(i.get('title')['rendered'])
        if type(title) == unicode:
            title = title.encode('utf-8')
        content = i.get('content')['rendered']
        date = i.get('date')[:10]
        yt_vurl = parse_ids_content(content)
        print "yt_vurl: %s" % yt_vurl
        if not yt_vurl:
            print content
        yt_regex = '(?:youtube(?:-nocookie)?\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})\W'
        yt_vid = re.findall(yt_regex, yt_vurl)[0]
        context_menu = []
        if enable_bookmark:
            context_menu.append(
                (_("Add to Bookmarks"),
                 'XBMC.RunPlugin({0})'.format(plugin.get_url(action='add_bookmark', id=_id))),
            )
        context_menu += [
            (_("Show tags"),
             'XBMC.Container.Update({0})'.format(plugin.get_url(action='tags_by_post', id=_id))),
            (_("Show categories"),
             'XBMC.Container.Update({0})'.format(plugin.get_url(action='categories_by_post', id=_id))),
            #(_("Force mirror search"),
            # 'XBMC.RunPlugin({0})'.format(plugin.get_url(action='force_mirror', name=title))),
        ]
        image = "https://i.ytimg.com/vi/{0}/hqdefault.jpg".format(yt_vid)
        listing.append({
            'label': title,
            'thumb': image,
            'info': {
                'video': {
                    'title': title,
                    'aired': date,
                    'year': date[:4],
                }
            },
            'context_menu': context_menu,
            'is_playable': True,
            'url': plugin.get_url(action='play', youtube_id=yt_vid, name=title),
        })
    if len(json_data) == PER_PAGE:
        next_page = page_from_url(url) + 1
        next_url = edit_url(url, {'page': next_page})
        listing.append({
            'label': '[COLOR blue]{0}[/COLOR]'.format(_("Next page")),
            'url': plugin.get_url(action='posts_by_url', url=next_url),
        })
    return listing
def node2str(self, xml_node):
    """XML Node to string"""
    xml_str = HTMLParser().unescape(str(xml_node))
    return xml_str.encode('utf-8') if isinstance(xml_str, unicode) else xml_str
def addEntryToFeedDb(feedXML, feedDb, archiveEntry, cacheImages, rssToolDir, table, isFb):
    dbc = feedDb.cursor()
    post = ""
    if archiveEntry.has_key("content"):
        post = archiveEntry["content"]["content"]
    elif archiveEntry.has_key("summary"):
        post = archiveEntry["summary"]["content"]
    title = ""
    if archiveEntry.has_key("title"):
        title = HTMLParser().unescape(archiveEntry["title"])
    post = urllib.unquote(HTMLParser().unescape(post))
    url = ""
    if archiveEntry.has_key("alternate"):
        url = archiveEntry["alternate"][0]["href"]
    if title == "":
        if url == "":
            print "Warning: No title or URL!"
    # get ID for post by hashing title with date added to front
    hashstring = str(title.encode('ascii', 'ignore')) + str(url.replace('https://', 'http://').encode('ascii', 'ignore'))
    id = hashlib.sha224(hashstring).hexdigest()
    # check if post already exists in db and insert if it does not
    selectedRow = dbc.execute('SELECT * FROM ' + table + ' WHERE id=?', (id,)).fetchone()
    if selectedRow == None:
        if cacheImages == True:
            # set up images dir
            xmlfilename = feedXML.replace('http://', '').replace('https://', '').replace('/', '_')
            if xmlfilename[-1] == '_':
                xmlfilename = xmlfilename[:-1]
            imagedir = rssToolDir + "feeds/" + xmlfilename
            if os.path.exists(imagedir + '/images') == False:
                os.makedirs(imagedir + '/images')
            # get image URLs
            h = imgParse()
            h.clear()
            h.feed(post)
            imageLinks = h.imgLinks
            # download images, rename, and replace image URLs in posts
            j = 0
            for image in imageLinks:
                imagequoted = image
                image = urllib.unquote(image)
                targetfile = image.rpartition('/')[2]
                targetfile = str(j)
                # if image is from Facebook, handle specially
                sourceimage = image
                if isFb == True:
                    fbimage = image
                    fbimage = fbimage.replace('s130x130/', '').replace('p130x130/', '')
                    fbimage = fbimage.replace('https://', 'http://')
                    if "url=" in fbimage:
                        fbimage = fbimage.split("url=")[1]
                    sourceimage = fbimage
                imageType = downloadImage(sourceimage, imagedir + "/images/" + str(id) + '_' + targetfile)
                # add image info to db
                if imageType != "NotAnImage":
                    if imageType != None:
                        imageQuery = (image, str(id) + '_' + targetfile + "." + imageType)
                        selectedImage = dbc.execute('SELECT * FROM images WHERE original=?', (image,)).fetchone()
                        if selectedImage == None:
                            dbc.execute('INSERT INTO images VALUES (?,?)', imageQuery)
                        else:
                            print "Warning: Image " + image + " has already been cached."
                j = j + 1
        # package post data into a row for db
        publishedTime = datetime.fromtimestamp(archiveEntry["published"])
        updatedTime = datetime.fromtimestamp(archiveEntry["updated"])
        postQuery = (id, title, url, publishedTime, updatedTime, post)
        dbc.execute('INSERT INTO ' + table + ' VALUES (?,?,?,?,?,?)', postQuery)
        feedDb.commit()
        print "Added post with ID " + str(id) + " to db table " + table
        return 0
    else:
        i = 0
        # print "Warning: Post with ID "+str(id)+" already exists in db."
        return 1
    try:
        # connect to page
        page = urllib2.urlopen(url)  # open url
        page = page.read()
        self.printlog("Connection successful.")
    except Exception, e:
        # if it fails, retry
        self.printlog("Connection failed: %s (retrying in 30 seconds)" % e)
        time.sleep(30)
        self.printlog("Retrying... (%s retries left)" % retry['count'])
    if not page:
        funcs.printlog("Connection failed after %s retries. Please check your internet connection. Exiting..." % retry['max'])
        sys.exit(1)
    page = page.decode('utf-8')
    page = HTMLParser().unescape(page)
    page = page.encode('utf-8')
    return page


class MLStripper(HTMLParser):

    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)

    def strip_tags(self, html):
        s = self.MLStripper()
        s.feed(html)