def parse_itemlisting_style(self):
    item_tds = self.soup.findAll('td', {'class': ('itemlisting', 'itemlisting2')})
    for td in item_tds:
        tr = td.findPrevious('tr')
        item = models.Item()

        marker = tr.find(text=re.compile("Print the title"))
        title = marker.nextSibling.strip()
        title = util.unescape(title)
        item.title = util.stripNonAscii(title)

        marker = tr.find(text=re.compile("Print the author"))
        if marker is None or marker.nextSibling is None:
            author = ''
        else:
            author = marker.nextSibling.strip().strip('.')
        L = author.split(',')
        author = ','.join(L[0:2])
        author = util.unescape(author)
        item.author = util.stripNonAscii(author)

        marker = tr.find(text=re.compile("Print the date due"))
        #<td>Due <!--Print the date due--> <strong>12/10/2011,....
        dueDate = marker.parent.find('strong').string.strip()
        dueDate = dueDate.split(',')[0]  #strip time
        item.dueDate = util.toDatetime(dueDate)

        self.itemsOut[item.title] = item
    print self.itemsOut

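# --- A hedged sketch, not part of the original code ---
# The parsers in this listing call util.unescape, util.stripNonAscii and
# util.toDatetime, none of which are shown here.  The helpers below are
# hypothetical stand-ins whose behavior is only inferred from how they are
# called; the project's real util module may differ.
import datetime
import HTMLParser  # Python 2 stdlib; html.parser on Python 3


def unescape(text):
    # turn HTML entities such as &amp; or &#x27; back into plain characters
    return HTMLParser.HTMLParser().unescape(text)


def stripNonAscii(text):
    # drop any non-ASCII characters the scraped pages contain
    return ''.join(c for c in text if ord(c) < 128)


def toDatetime(text):
    # due dates in the markup look like 12/10/2011 (see the comment above)
    return datetime.datetime.strptime(text, '%m/%d/%Y')
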
def parse(self):
    duecomments = self.soup.findAll(text=re.compile("Due Date"))
    for comment in duecomments:
        tr = comment.findPrevious('tr')
        item = models.Item()

        marker = tr.find(text=re.compile("Title"))
        if marker is None:
            marker = tr.find(text=re.compile("Print the title"))
        title = self.findcontent(marker.parent)
        title = util.unescape(title)
        item.title = util.stripNonAscii(title)

        marker = tr.find(text=re.compile("Author"))
        author = self.findcontent(marker.parent)
        L = author.split(',')
        author = ','.join(L[0:2])
        author = util.unescape(author)
        item.author = util.stripNonAscii(author)

        marker = tr.find(text=re.compile("Due Date"))
        dueDate = self.findcontent(marker.parent)
        dueDate = dueDate.split(',')[0]  #strip time
        item.dueDate = util.toDatetime(dueDate)

        self.itemsOut[item.title] = item

def from_text(cls, text):
    match = cls.token_re.match(text)
    if not match:
        raise ParseTokenError('cannot parse Token from {}'.format(text))
    groups = match.groupdict()
    return cls(
        unescape(groups['word']),
        unescape(groups['lemma']),
        unescape(groups['pos'])
    )

def from_text(cls, text):
    match = cls.arg_re.match(text)
    if not match:
        raise ParseTokenError('cannot parse Argument from {}'.format(text))
    groups = match.groupdict()
    return cls(
        unescape(groups['word']),
        unescape(groups['lemma']),
        unescape(groups['pos']),
        groups['ner'] if groups['ner'] != 'NONE' else '',
        int(groups['entity_idx']) if groups['entity_idx'] else -1,
        int(groups['mention_idx']) if groups['mention_idx'] else -1)

def from_text(cls, text):
    match = cls.pred_re.match(text)
    if not match:
        raise ParseTokenError(
            'cannot parse Predicate from {}'.format(text))
    groups = match.groupdict()
    return cls(
        unescape(groups['word']),
        unescape(groups['lemma']),
        unescape(groups['pos']),
        groups['neg'] is not None,
        unescape(groups['prt']) if groups['prt'] is not None else '')

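# --- A hedged illustration, not the project's actual patterns ---
# The three from_text classmethods above rely on class-level compiled
# regexes (token_re, arg_re, pred_re) exposing the named groups they read.
# The pattern below is a made-up example of the shape pred_re would need
# ('word', 'lemma', 'pos', optional 'neg', optional 'prt'); the real
# patterns are defined elsewhere and may differ.
import re

pred_re = re.compile(
    r'^(?P<word>[^/]+)/(?P<lemma>[^/]+)/(?P<pos>[^/]+)'
    r'(?P<neg>/NEG)?'
    r'(?:/(?P<prt>[^/]+))?$')

m = pred_re.match('took/take/VBD/NEG/over')
print m.groupdict() if m else None
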
def _displayEntry(self, index):
    entry = self.container.items[index - 1]
    urls = util.find_urls(entry.content)
    title = util.unescape(entry.title).replace("\n", ' ').encode('utf-8')
    content = util.strip_tags(util.unescape(entry.content)).encode('utf-8')
    print title
    print content
    #uniqify the urls
    for i in list(set(urls)):
        print ''.join(i)

def ud(self):
    if not self.values:
        self.chat("Whatchu wanna know, bitch?")
        return

    try:
        request = pageopen('http://www.urbandictionary.com/define.php',
                           params={'term': ' '.join(self.values)})
        soup = bs4(request.text)
    except:
        self.chat("parse error")
        return

    elem = soup.find('div', {'class': 'meaning'})

    try:
        defn = []
        for string in elem.stripped_strings:
            defn.append(string)
    except:
        self.chat("couldn't find anything")

    if defn:
        # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
        # entities like &#x27; so use the parser for any stray entities.
        for paragraph in defn:
            wrapped = textwrap.wrap(paragraph, 200)
            for line in wrapped:
                self.chat(unescape(line))
    else:
        self.chat("couldn't find anything")

def loadDoi(filename):
    """Load <batch>_doi.xml"""
    #refs = load_bib_blocks(file)
    #Lets load the DOI. First we assume unixref
    full_path = util.sanitizeXML(filename)
    doc = minidom.parse(full_path)
    keys = []
    if doc.hasChildNodes():
        if doc.childNodes[0].nodeName == "doi_records":
            keys = doc.getElementsByTagName('doi_record')
        elif doc.childNodes[0].nodeName == "crossref_result":
            keys = doc.getElementsByTagName('query')
        else:
            print "Invalid result file ... ignoring %s" % filename

    #build a dictionary of the keys that have doi
    doi_keys = {}
    for key in keys:
        try:
            refkey = key.getAttribute("key")
            refdoi = key.getElementsByTagName("doi")
            if refdoi:
                newdoi = refdoi[0].childNodes[0].nodeValue.strip()
                doi_keys[refkey] = util.unescape(newdoi)
        except AttributeError:
            continue
    return doi_keys

def code(bot, msg, language, _, code):
    """
    Run arbitrary code of the specified language.

    Usage:
        @you: @bot python `print [x ** 2 for x in xrange(10) if x % 2]`
        @bot: [1, 9, 25, 49, 81]

    Valid languages include python, py3, ruby, coffeescript, gcc (C), and php.
    """
    uri = 'https://eval.in/'
    data = {
        "utf8": "\xce\xbb",
        "execute": "on",
        "private": "on",
        "lang": supported_languages[language],
        "input": "",
        "code": util.flatten_incoming_text(bot, code).encode('utf-8'),
    }

    response = requests.post(uri, data)
    bot.debug(response.url)

    _, html = response.content.split("<h2>Program Output</h2>", 1)
    html = html.lstrip()
    html = html[5: html.index("</pre>")]
    output = util.unescape(html).rstrip().decode('utf-8')

    if output:
        try:
            bot.reply(u"```{}```".format(output))
        except exception.MessageTooLongException:
            bot.reply(response.url)
    else:
        bot.reply("No output...")

def parse(self):
    # look for pending fine
    fine = self.soup.find('div', {'id': 'panelVerifyCharges'})
    if fine is not None:
        raise PendingFineException

    row = self.soup.find('div', {'id': 'panelMessage'})
    titles = row.findAll('i')
    for title in titles:
        item = models.Item()
        reason = title.nextSibling.strip()
        if reason == 'is renewed.':
            item.renewed = True
            item.renewalError = None
        else:
            item.renewed = False
            error_ul = title.findNextSibling('ul')
            if error_ul is None:
                item.renewalError = 'Renewal failed'
            else:
                item.renewalError = error_ul.li.string

        titlestr = title.contents[0].strip()
        titlestr = util.unescape(titlestr)
        titlestr = util.stripNonAscii(titlestr)
        self.renewalItems[titlestr] = item

def parse(self):
    self.form = self.soup.find("form", {"name": "hasnow"})
    row = self.soup.find('input', {'name': 'HASNOW'})
    if row is None:
        return
    table = row.findPrevious('table')
    #print table.__class__.__name__
    #print table.prettify()
    rows = table.findAll('tr')
    #print len(rows)
    for itemrow in rows:
        #print row.__class__.__name__
        #print row.prettify()
        # ignore the header row -- we know it's a header if there isn't a renewal checkbox next to it
        if itemrow.find('input', {'name': 'HASNOW'}) is None:
            continue
        item = models.Item()
        #print row.prettify()
        renewitemkeys = itemrow.find('input', {'name': 'HASNOW'})
        divs = itemrow.findAll('div', {'id': 'globaltext'})
        #print len(divs)
        title = divs[0].string.strip()
        title = util.unescape(title)
        item.title = util.stripNonAscii(title)
        #print title
        dueDate = divs[4].string.strip()
        dueDate = dueDate.split(',')[0]  #strip time
        item.dueDate = util.toDatetime(dueDate)
        self.itemsOut[item.title] = item

def parse_title(self, td, item):
    link = td.find('a')
    title = util.unescape(link.text.strip(' :/.'))
    item.title = util.stripNonAscii(title)
    span = td.find('span')
    if span is not None and span.text is not None:
        item.author = span.text.strip(' :/.')
    return item

def parseTitle(self, td, item):
    links = td.findAll("a", {"class": lambda x: x != "boldRedFont1"})

    # for some reason many title links have a superfluous ' /' at the end -- remove this
    title = links[0].string.rstrip(" /")
    title = util.unescape(title)
    item.title = util.stripNonAscii(title)

    author = links[1].string
    author = author.rstrip(".")
    if author.startswith("by "):
        author = author.replace("by ", "", 1)
    # sometimes there is extraneous information after the author's name, ex: Dylan, Bob, 1941-
    L = author.split(",")
    author = ",".join(L[0:2])
    author = util.unescape(author)
    item.author = util.stripNonAscii(author)

    return item

def parseTimesRenewed(self, td, item):
    links = td.findAll("a", {"class": lambda x: x != "boldRedFont1"})
    # some horizon sites leave timesrenewed column blank instead of 0
    timesRenewed = links[0].string.strip()
    timesRenewed = util.unescape(timesRenewed)
    try:
        item.timesRenewed = int(timesRenewed)
    except ValueError:
        item.timesRenewed = 0
    return item

def parseTitle(self, td, item):
    span = td.find('span')
    link = span.find('a')
    if link is None:
        title = span.contents[0].strip()
    else:
        title = link.contents[0].strip()
    title = util.unescape(title)
    item.title = util.stripNonAscii(title)
    return item

def linker(self, urls):
    for url in urls:
        # Special behaviour for Twitter URLs
        match_twitter_urls = re.compile("http[s]?://(www.)?twitter.com/.+/status/([0-9]+)")
        twitter_urls = match_twitter_urls.findall(url)
        if len(twitter_urls):
            self.tweet(twitter_urls)
            return

        fubs = 0
        title = "Couldn't get title"
        roasted = "Couldn't roast"

        urlbase = pageopen(url)
        if not urlbase:
            fubs += 1

        try:
            opener = urllib2.build_opener()
            roasted = opener.open(SHORTENER + url).read()
        except:
            fubs += 1

        ext = url.split(".")[-1]
        images = ["gif", "png", "jpg", "jpeg"]
        if ext in images:
            title = "Image"
        elif ext == "pdf":
            title = "PDF Document"
        else:
            try:
                cont = soup(urlbase, convertEntities=soup.HTML_ENTITIES)
                title = cont.title.string
            except:
                self.chat("Page parsing error")
                return

        deli = "https://api.del.icio.us/v1/posts/add?"
        data = urllib.urlencode({"url": url,
                                 "description": title,
                                 "tags": "okdrink," + self.lastsender})

        if DELICIOUS_USER:
            base64string = base64.encodestring("%s:%s" % (DELICIOUS_USER, DELICIOUS_PASS))[:-1]
            try:
                req = urllib2.Request(deli, data)
                req.add_header("Authorization", "Basic %s" % base64string)
                send = urllib2.urlopen(req)
            except:
                self.chat("(delicious is down)")

        if fubs == 2:
            self.chat("Total fail")
        else:
            self.chat(unescape(title) + " @ " + roasted)

def _get_categorys():
    error, categorys = _exc_sql(
        'select distinct `category` from `findnsave_sale_t` limit 500')
    categorys = categorys or []
    for d in categorys:
        for k in d:
            d[k] = unescape(d[k])
    if categorys:
        categorys.sort(key=lambda x: x['category'])
    return error, categorys

def get_rail_videos(self, **kwargs):
    video_count = last_count = 0
    videos = util.struct()
    videos.list = []
    videos.next = 1
    while video_count < int(self.plugin.get_setting('page_size') or 15):
        data = requests.get(RAIL_URL % kwargs).text
        # match video 'rail's
        # match: (title, video_id, date [DD/MM/AAAA],
        #         thumb, duration [MM:SS], plot)
        regExp = (
            r'<li.*data-video-title="(.+?)"[\s]+data-video-id="(.+?)"[\s]+' +
            r'data-video-data-exibicao="(.+?)">[\s\S]+?' +
            r'<img.+src="(.+?)"[\s\S]+?' +
            r'<span class="duracao.*?">(.+?)</span>[\s\S]+?' +
            r'div class="balao">[\s]+?<p>[\s]+?([\w].+?)[\s]+?</p>'
        )
        matches = re.compile(regExp).findall(data)
        mcount = len(matches)
        properties = ('title', 'id', 'date', 'thumb', 'duration', 'plot')
        for item in matches:
            video = util.struct(dict(zip(properties, item)))
            # update attrs
            video.title = util.unescape(video.title)
            video.plot = util.unescape(video.plot)
            video.date = video.date.replace('/', '.')
            _split = video.duration.split(':')
            video.duration = sum(int(x) * 60 ** i
                                 for i, x in enumerate(reversed(_split)))
            self.cache.set('video|%s' % video.id, repr(video))
            videos.list.append(video)
        if mcount == 0 or mcount < last_count:
            videos.next = None
            break
        video_count += mcount
        last_count = mcount
        kwargs['page'] += 1
    if videos.next:
        videos.next = kwargs['page']
    return videos

def parse(self):
    self.form = self.soup.find('form', {'name': 'renewitems'})
    checkboxes = self.form.findAll('input', {'type': 'checkbox'})
    for checkbox in checkboxes:
        item = models.Item()
        item.renewitemkey = checkbox['name']
        title_label = checkbox.findNext('td').label
        title = title_label.contents[2].strip()
        title = util.unescape(title)
        item.title = util.stripNonAscii(title)
        self.renewalitems[item.title] = item

def _get_findnsave_data(table, cols):
    error = None
    sql = "select %s from `%s`" % (', '.join(['`%s`' % c for c in cols]), table)
    logger.info(sql)
    error, data = _exc_sql(sql)
    data = data or []
    for d in data:
        for k in d:
            d[k] = unescape(d[k])
    return error, data

def _displayFeeds(self, forceRefresh=False):
    if forceRefresh:
        self.container.clearItems()
        self.container.loadItems()
    index = 1
    for item in self.container.getItems():
        title = util.unescape(item.title).replace("\n", ' ').encode('utf-8')
        if len(title) > 80:
            title = title[0:77] + '...'
        author = item.author or ''
        author = author.encode('utf-8')
        if item.isUnread():
            print "%2s: %s [%s]" % (index, title, author)
        index += 1

def urlparse(self, url):
    if self.cleanse(url) == False:
        return [url]

    fubs = 0
    title = "Couldn't get title"

    site = Browser(url)
    if site.error:
        self.chat('Total fail: %s' % site.error)
        return [url]

    roasted = shorten(url)
    if not roasted:
        roasted = "Couldn't roast"
        fubs += 1

    self.chat('%s @ %s' % (unescape(site.title()), roasted))
    return [url]

def do_directmessage(self, **kwargs):
    if not kwargs.has_key("params"):
        for dm in reversed(self.tnt.getDirectMessages()):
            print_string = u"%(threading_color)s%(threading)s%(reset_color)s%(datetime)s %(id_color)s%(id)d : %(name)s%(nick_color)s%(username)s %(text_color)s: %(text)s%(reset_color)s"
            tweet_out_data = {"reset_color": COLOR_WHITE}
            tweet_out_data["username"] = u"@" + dm.sender_screen_name
            tweet_out_data["id"] = self.tnt.getIdFor(pos)
            tweet_out_data["text"] = util.unescape(self.tnt.getTextFor(pos))
            tweet_out_data["datetime"] = unicode(
                datetime.datetime.fromtimestamp(self.tnt.getTimeFor(pos)).strftime("%H:%M:%S")
            )
            tweet_out_data["id_color"] = COLOR_CYAN
            tweet_out_data["text_color"] = COLOR_YELLOW
            toprint = dm.sender_screen_name + u": " + dm.text
            print(toprint.encode("utf-8"))
    elif len(kwargs["params"].split(" ")) >= 2:
        user = kwargs["params"].split(" ")[0]
        message = " ".join(kwargs["params"].split(" ")[1:])
        self.tnt.sendDirectMessage(user, message)
    else:
        print(u"──> DM: wrong param set".encode("utf-8"))

def printStatus(self, pos):
    print_string = u"%(reset_color)s%(datetime)s %(id_color)s%(id)d :%(threading_color)s%(threading)s %(name)s%(nick_color)s%(username)s : %(text_color)s%(text)s%(reset_color)s"
    tweet_out_data = {"reset_color": COLOR_WHITE}

    # First, the info
    if self.config["showFullName"]:
        tweet_out_data["name"] = self.tnt.getAuthorNameFor(pos) + " "
    else:
        tweet_out_data["name"] = u""
    tweet_out_data["username"] = u"@" + self.tnt.getAuthorScreenNameFor(pos)
    tweet_out_data["id"] = self.tnt.getIdFor(pos)
    tweet_out_data["text"] = util.unescape(self.tnt.getTextFor(pos))
    tweet_out_data["datetime"] = unicode(
        datetime.datetime.fromtimestamp(self.tnt.getTimeFor(pos)).strftime("%H:%M:%S")
    )

    order = self.tnt.getThreadPositionOf(self.tnt.getIdFor(pos))
    if order > 0:
        tweet_out_data["threading"] = u" └─" + u"─" * (order - 1) + u"> "
    else:
        tweet_out_data["threading"] = ""

    # we set colors
    # if the tweet speaks about the user
    tweet_out_data["threading_color"] = COLOR_RED
    tweet_out_data["nick_color"] = COLOR_RED
    if self.tnt.getTextFor(pos).find(self.tnt.getUser().GetScreenName()) > -1:
        tweet_out_data["id_color"] = COLOR_CYAN
        tweet_out_data["text_color"] = COLOR_YELLOW
    # if the tweet's author is the user or the author is in the hilight list
    elif self.tnt.getAuthorScreenNameFor(pos).find(self.tnt.getUser().GetScreenName()) > -1:
        tweet_out_data["id_color"] = COLOR_CYAN
        tweet_out_data["text_color"] = COLOR_GREEN
    elif "@" + self.tnt.getAuthorScreenNameFor(pos) in self.config["hilight"]:
        tweet_out_data["id_color"] = COLOR_MAGENTA
        tweet_out_data["text_color"] = COLOR_MAGENTA
    # if it's a normal tweet
    else:
        tweet_out_data["id_color"] = COLOR_GREEN
        tweet_out_data["text_color"] = COLOR_WHITE

    # now we print it
    final_print_string = print_string % tweet_out_data
    print(final_print_string.encode("utf-8"))

def parse(self):
    #print self.soup.prettify()
    dds = self.soup.findAll('dd')
    for dd in dds:
        item = models.Item()
        reasonSoup = dd.findPrevious('strong')
        print reasonSoup.prettify()
        reason = util.inner_text(reasonSoup)
        print "reason=" + reason
        if reason == 'Item renewed':
            item.renewed = True
            item.renewalError = None
        else:
            item.renewed = False
            item.renewalError = reason
        title = dd.contents[0].strip()
        title = util.unescape(title)
        title = util.stripNonAscii(title)
        self.renewalItems[title] = item

def ud(self):
    if not self.values:
        return "Whatchu wanna know, bitch?"

    term = ' '.join(self.values)
    term = term.strip()

    if term == 'truffle butter':
        return "You all know what it is, and I don't want to have to read this shit again."

    try:
        request = Browser('http://www.urbandictionary.com/define.php',
                          params={'term': term})
        soup = request.soup()
    except:
        return "parse error"

    elem = soup.find('div', {'class': 'meaning'})

    try:
        defn = []
        for string in elem.stripped_strings:
            defn.append(string)
    except:
        return "couldn't find anything"

    if not defn:
        return "couldn't find anything"

    # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
    # entities like &#x27; so use the parser for any stray entities.
    response = []
    for paragraph in defn:
        wrapped = textwrap.wrap(paragraph, 200)
        _response = unescape(' '.join(wrapped))
        response.append(_response)

    return ' '.join(response)

def getMyVideos(session):
    result = []
    content = session.get(youtubeUrl + 'my_videos' + '?' +
                          urllib.urlencode({'o': 'U'})).text
    dummy, i = util.substr('"VIDEO_LIST_DISPLAY_OBJECT"', ':', content)
    data = json.loads(util.parseBrackets(content, i, ['[', ']']))
    for item in data:
        soup = BeautifulSoup(
            util.unescape(item['html'].decode('unicode_escape')),
            "html.parser")
        ptag = soup.find(class_="vm-video-indicators")
        privacy = 'Public'
        if not ptag.find(class_='vm-unlisted').parent.has_attr('aria-hidden'):
            privacy = 'Private'
        if not ptag.find(class_='vm-private').parent.has_attr('aria-hidden'):
            privacy = 'Private'
        try:
            duration = util.timeStrToSeconds(
                soup.find(class_="video-time").get_text())
        except:
            duration = ''
        result.append({
            'id': item['id'],
            'name': soup.find(class_="vm-video-title-content").get_text(),
            'thumb': videoImage(item['id']),
            'duration': duration,
            'privacy': privacy,
            'user': '******'
        })
    return result

def findnsave_sale():
    error = None

    #err, areas = _get_findnsave_data( 'findnsave_area', ('area',) )
    areas = [{'area': 'newyork'}]
    #err, stores = _get_findnsave_data( 'findnsave_store', ('name',) )
    stores = [
        {'name': 'Walmart'},
        {'name': 'Target'},
        {'name': 'Toys"R"Us'},
    ]
    err, brands = _get_findnsave_data('findnsave_brand', ('name',))
    err, categorys = _get_categorys()

    if request.method == 'POST':
        keywords = request.form['keywords'].strip()
        area = request.form['select_area']
        store = request.form['select_store']
        brand = request.form['select_brand']
        category = request.form['select_category']
        num = request.form['number']
        action = request.form['action']

        where = []
        if area != 'All':
            where.append("`area`=%s" % esql.escape_string(area))
        if store != 'All':
            where.append("`retailer`=%s" % esql.escape_string(store))
        if brand != 'All':
            where.append("`brand`=%s" % esql.escape_string(brand))
        if category != 'All':
            where.append("`category`=%s" % esql.escape_string(category))

        keywords = keywords.split()
        for kw in keywords:
            where.append("`name` like %s" % esql.escape_string('%' + kw + '%'))

        if where:
            where = 'where ' + ' and '.join(where)
        else:
            where = ''

        cols = ('_ID', 'area', 'name', 'retailer', 'brand', 'category',
                'price', 'priceRegular', 'priceOff', 'priceUtilDate',)

        sql = "select * from `%s` %s" % ('findnsave_sale_t', where,)
        if num != 'total':
            sql += ' limit ' + num
        logger.info(sql)

        try:
            db = esql.Database(conf.dbconf)
            data = db.conn.read(sql)
        except Exception as e:
            logger.exception(repr(e))
            error = repr(e)
            data = []

        for d in data:
            for k in d:
                d[k] = unescape(d[k])
                if d[k] == '':
                    d[k] = 'not specified'

        if action == 'export':
            name = ','.join([area, store, brand, num])
            return _findnsave_sale_download(name, data, cols + ('desc',))

        return render_template('findnsave_show_sales.html',
                               error=error,
                               areas=areas,
                               stores=stores,
                               brands=brands,
                               categorys=categorys,
                               sales=data,
                               cols=cols)

    return render_template('findnsave_show_sales.html',
                           error=error,
                           areas=areas,
                           stores=stores,
                           brands=brands,
                           categorys=categorys,
                           sales=[],
                           cols=[])

def render(self):
    self.body_html = _get_markup_formatter()(self.body)

    # Remove tags which were generated by the markup processor
    text = strip_tags(self.body_html)

    # Unescape entities which were generated by the markup processor
    self.body_text = unescape(text)

res_data["food"] = 0
res_data["decor"] = 0
res_data["service"] = 0
res_data["cost"] = 0
res_data_list = []

e2 = soup.find_all("div", class_="case js-case")
# print e2[0]
# print e2[0].find("div",class_="image")
for elem in e2:
    t = elem.find("div", class_="text")
    res_name = t.find("div", class_="text-cnt Restaurants").a.text
    res_cui = unescape(t.find("div", class_="text-cnt Restaurants").p.text)
    # str.decode("utf-8").replace(res_cui, "@")
    special = u"\u2022"
    res_cui = res_cui.replace(special, '@')
    res_cui = parseres_name(res_cui)
    stats = t.select(".text-stats")
    res_data["name"] = res_name
    res_data["cuisine"] = res_cui
    res_data["food"] = convert(t.select(".i-number.i-number-red")[0].text)
    res_data["decor"] = convert(t.select(".i-number")[0].text)
    res_data["service"] = convert(t.select(".i-number")[1].text)
    res_data["cost"] = convert(t.select(".i-number")[2].text)
    print res_data
    newd = res_data.copy()
    res_data_list.append(newd)

def real_filename_complete(self, text, line, begidx, endidx):
    """Figure out what filenames match the completion."""

    # line contains the full command line that's been entered so far.
    # text contains the portion of the line that readline is trying to complete
    # text should correspond to line[begidx:endidx]
    #
    # The way the completer works text will start after one of the characters
    # in DELIMS. So if the filename entered so far was "embedded\ sp" and
    # then text will point to the s in sp.
    #
    # The following bit of logic backs up to find the real beginning of the
    # filename.
    for before_match in range(begidx, 0, -1):
        if line[before_match] in self.DELIMS and before_match >= 1 and line[before_match - 1] != '\\':
            break

    # We set fixed to be the portion of the filename which is before text
    # and match is the full portion of the filename that's been entered so
    # far (that's that part we use for matching files).
    #
    # When we return a list of completions, the bit that we return should
    # just be the portion that we replace 'text' with.

    # fixed portion of the match
    fixed = unescape(line[before_match + 1:begidx])
    # portion to match filenames against
    match = unescape(line[before_match + 1:endidx])

    # We do the following to cover the case that the current directory
    # is / and the path being entered is relative.
    if match[0] == '/':
        abs_match = match
    elif self.cur_dir == '/':
        abs_match = self.cur_dir + match
    else:
        abs_match = self.cur_dir + '/' + match

    completions = []
    prepend = ''
    if abs_match.rfind('/') == 0:  # match is in the root directory
        # This means that we're looking for matches in the root directory
        # (i.e. abs_match is /foo and the user hit TAB).
        # So we'll supply the matching board names as possible completions.
        # Since they're all treated as directories we leave the trailing slash.
        if match[0] == '/':
            completions += [dev.name_path for dev in self.boards.boards()
                            if dev.name_path.startswith(abs_match)]
        else:
            completions += [dev.name_path[1:] for dev in self.boards.boards()
                            if dev.name_path.startswith(abs_match)]
        try:
            # Add root directories of the default device
            def_dev = self.boards.default
            if match[0] == '/':
                completions += [root_dir for root_dir in def_dev.root_dirs
                                if root_dir.startswith(match)]
            else:
                completions += [root_dir[1:] for root_dir in def_dev.root_dirs
                                if root_dir[1:].startswith(match)]
        except BoardError:
            pass
    else:
        # This means that there are at least 2 slashes in abs_match. If one
        # of them matches a board name then we need to remove the board
        # name from fixed. Since the results from listdir_matches won't
        # contain the board name, we need to prepend each of the completions.
        for dev in self.boards.boards():
            if abs_match.startswith(dev.name_path):
                prepend = dev.name_path[:-1]

    paths = sorted(auto(self.boards, listdir_matches, match))
    for path in paths:
        path = prepend + path
        completions.append(escape(path.replace(fixed, '', 1)))
    return completions

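# --- A hedged sketch, not the original helpers ---
# real_filename_complete above pairs unescape() (applied to the partially
# typed path) with escape() (applied to the completions it returns).  The
# pair below is a minimal guess at that behavior, plain backslash escaping
# of spaces, and is not the project's actual implementation.
def escape(path):
    # backslash-escape backslashes and spaces so readline keeps the path as one word
    return path.replace('\\', '\\\\').replace(' ', '\\ ')


def unescape(path):
    # undo escape(): collapse the backslash escapes back to raw characters
    return path.replace('\\ ', ' ').replace('\\\\', '\\')
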
def linker(self, urls):
    for url in urls:
        # Special behaviour for Twitter URLs
        match_twitter_urls = re.compile(
            'http[s]?://(www.)?twitter.com/.+/status/([0-9]+)')
        twitter_urls = match_twitter_urls.findall(url)
        if len(twitter_urls):
            self.tweet(twitter_urls)
            return

        if url.find('gist.github') != -1:
            return

        if randint(1, 5) == 1:
            try:
                self.commands.get('tweet', self.default)(url)
            except:
                pass

        fubs = 0
        title = "Couldn't get title"

        site = Browse(url)
        if site.error:
            self.chat('Total fail: %s' % site.error)
            continue

        roasted = shorten(url)
        if not roasted:
            roasted = "Couldn't roast"
            fubs += 1

        try:
            ext = site.headers()['content-type'].split('/')[1]
        except:
            ext = False

        images = [
            'gif',
            'png',
            'jpg',
            'jpeg',
        ]

        if ext in images:
            title = 'Image'

            # Switch this to a Browse method
            if STORE_IMGS:
                fname = url.split('/').pop()
                path = IMGS + fname
                self.butler.do(savefromweb,
                               (url, path, self.lastsender),
                               'Thumb @ %s')
        elif ext == 'pdf':
            title = 'PDF Document'
        else:
            title = site.title()

        # If you have a delicious account set up. Yes, delicious
        # still exists. Could be updated to a cooler link
        # collecting service.
        if STORE_URLS:
            postdelicious(url, title, self.lastsender)

        if fubs == 2:
            self.chat("Total fail")
        else:
            self.chat("%s @ %s" % (unescape(title), roasted))

def search_site(self, url, resource_dict):
    """Downloads the URL's content, searches for the paths and patterns
    and builds a message out of the matched data.

    Arguments:
        resource_dict contains the paths, patterns and additional data for the url.
    """
    if self.sitedata is None:
        return

    # retrieve content
    try:
        content = download_page(url).decode(WEB_ENCODING, "replace")
    except:
        return
    if content is None:
        return

    message = None
    title = None

    def info_xpath():
        # try to find info using xpath
        root = lxml.html.fromstring(content)
        items = root.xpath(info["xpath"])
        logger.debug("using xpath: " + info["xpath"])
        if items is not None and len(items) >= 1:
            return items[0]
        else:
            return None

    def info_regex():
        # try to find info using a regex pattern
        logger.debug("using regex: " + info["pattern"])
        match = re.search(info["pattern"], content)
        if match is None:
            logger.warning(
                "Could not find info! (match == None) with pattern: "
                + info["pattern"])
            return None
        if match.groups() is None:
            logger.warning("match.groups() was None")
            return None
        if len(match.groups()) <= 0:
            logger.warning("Found match but no groups")
            return None
        return match.group(1)

    for info in resource_dict["patterns"]:
        if not "pattern" in info and not "xpath" in info:
            logger.error("siteinfo entry does not contain a path or pattern!")
            break

        infodata = None
        # try regex first because it seems to be faster
        if "pattern" in info:
            infodata = info_regex()
        # try xpath if there was no pattern or regex was unsuccessful
        if infodata is None and "xpath" in info:
            infodata = info_xpath()

        if infodata is None:
            logger.warning("infodata was None!")
            break

        logger.debug("\ninfodata:\n")
        logger.debug(infodata)

        if infodata is None or infodata == "":
            continue

        logger.info("found info data: " + infodata)
        infodata = unescape(infodata)
        infodata = escape(infodata)
        infodata = infodata.strip()

        if title is None:
            title = infodata

        color = REST_COLOR
        style = REST_STYLE
        if message is None:
            message = ""
            color = FIRST_COLOR
            style = FIRST_STYLE

        message += self.msg_formats.get(style, self.msg_formats.get(color, infodata))

        if info != resource_dict["patterns"][-1]:
            message += " " + self.sitedata["separator"] + " "

    # cut last separator if there is one
    sep = self.sitedata["separator"]
    if message is not None and message.strip()[-len(sep):] == sep:
        message = message.strip()[:-len(sep)].strip()

    return message, title

def create_report(model):
    """Create txt and html reports based on model values"""
    jinja_environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
        extensions=['jinja2.ext.autoescape'],
        autoescape=True)

    q_countries, q_dates, q_queries = countries_dates_queries(model, 'downloads')
    s_countries, s_dates, s_queries = countries_dates_queries(model, 'searches')

    try:
        m_year_downloads = model['year']['downloads']
        m_year_records = model['year']['records']
    except KeyError:
        m_year_downloads = 'No data'
        m_year_records = 'No data'
    try:
        m_year_searches = model['year']['searches']
        m_year_s_records = model['year']['s_records']
    except KeyError:
        m_year_searches = 'No data'
        m_year_s_records = 'No data'
    try:
        m_hist_downloads = model['history']['downloads']
        m_hist_records = model['history']['records']
    except KeyError:
        m_hist_downloads = 'No data'
        m_hist_records = 'No data'
    try:
        m_hist_searches = model['history']['searches']
        m_hist_s_records = model['history']['s_records']
    except KeyError:
        m_hist_searches = 'No data'
        m_hist_s_records = 'No data'

    template_values = {
        # General values
        'inst': model['inst'],
        'resname': model['col'],
        'time_lapse': model['report_month_string'],
        'generated': model['created_at'],
        # Downloads
        'downloads': model['downloads']['downloads'],
        'total_downloads': model['downloads']['downloads_period'],
        'records': model['downloads']['records'],
        'total_records': model['downloads']['records_period'],
        'unique_records': model['downloads']['records_unique'],
        'len_countries': len(model['downloads']['countries_list']),
        'countries': q_countries,
        'query_dates': q_dates,
        'queries': q_queries,
        # Searches
        'searches': model['searches']['searches'],
        'records_searched': model['searches']['records'],
        's_len_countries': len(model['searches']['countries_list']),
        's_countries': s_countries,
        's_query_dates': s_dates,
        's_queries': s_queries,
        # Cumulative data
        'year_downloads': m_year_downloads,
        'year_records': m_year_records,
        'year_searches': m_year_searches,
        'year_s_records': m_year_s_records,
        'history_downloads': m_hist_downloads,
        'history_records': m_hist_records,
        'history_searches': m_hist_searches,
        'history_s_records': m_hist_s_records
    }

    template_txt = jinja_environment.get_template('template.txt')
    report_txt = unescape(template_txt.render(template_values))

    template_html = jinja_environment.get_template('template.html')
    report_html = template_html.render(template_values)

    return report_txt, report_html
