def export_news(offset, limit, force, export_path):
    for (dirpath, dirnames, filenames) in os.walk("to_import"):
        for filename in filenames:
            with open(os.sep.join((dirpath, filename))) as opened_file:
                parser = BeautifulSoup(opened_file, 'xml')
                if not os.path.exists(export_path):
                    os.makedirs(export_path)
                rows = parser.find_all("content")
                for row in rows:
                    title = row.find("title").text
                    url = row.find("url").text.replace("/administrator", "")
                    html_parser = HTMLParser()
                    url = html_parser.unescape(html_parser.unescape(url))
                    pub_date = row.find("publish_up").text
                    mod_date = row.find("modified").text
                    featured = bool(int(row.find("featured").text))
                    if mod_date == "0000-00-00 00:00:00":
                        mod_date = pub_date
                    res = prepare_dict(url)
                    if not res:
                        logger.warning("error for url %s" % url)
                        continue
                    res["title"] = title
                    res["id"] = normalize(title, max_length=200)
                    res["category"] = row.find("catid").text
                    res["pub_date"] = pub_date
                    res["mod_date"] = mod_date
                    res["featured"] = featured
                    res["hits"] = row.find("hits").text
                    save_json(export_path, res)
        break

def getSets(self):
    """
    set names are always mined from the same, trusted source: magiccards.info
    @return:
    """
    os.environ['http_proxy'] = ''
    proxies = {
        # "http": "http://*****:*****@3.187.59.236:9400",
        # "https": "http://*****:*****@3.187.59.236:9400",
    }
    results = requests.get("http://magiccards.info/search.html", proxies=proxies, timeout=10)
    # print results.text
    setstag = re.search(r'<label\s+for="edition">.*?<option value=""></option>(.*?)</select>',
                        results.text, re.DOTALL)
    setsraw = setstag.group(1).strip()
    # print setsraw
    setnames = []
    hp = HTMLParser()
    print setsraw
    for st in striplist(setsraw.split(r'option>')):
        print 'stmax', st
        rawElem = re.search(r'value="(.*)">(.*?)<', st)
        if rawElem:
            setkey = hp.unescape(rawElem.group(1))
            setname = hp.unescape(rawElem.group(2))
            print setkey, setname
            setnames.append(setname)
    setnames.remove('All Sets')
    self.setnames = setnames

def get_user_realname(user):
    from ckanext.dgu.drupalclient import DrupalClient
    from HTMLParser import HTMLParser

    if user.name.startswith('user_d'):
        user_id = user.name[len('user_d'):]
        html_parser = HTMLParser()
        try:
            dc = DrupalClient()
            properties = dc.get_user_properties(user_id)
        except Exception, ex:
            return user.fullname
        try:
            first_name = properties['field_first_name']['und'][0]['safe_value']
            first_name = html_parser.unescape(first_name)
        except:
            first_name = ''
        try:
            surname = properties['field_surname']['und'][0]['safe_value']
            surname = html_parser.unescape(surname)
        except:
            surname = ''

def iter_movies(self, pattern):
    res = self.readurl("http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s" % pattern.encode("utf-8"))
    jres = json.loads(res)
    htmlparser = HTMLParser()
    for cat in ["title_popular", "title_exact", "title_approx"]:
        if cat in jres:
            for m in jres[cat]:
                tdesc = unicode(m["title_description"])
                if "<a" in tdesc and ">" in tdesc:
                    short_description = u"%s %s" % (
                        tdesc.split("<")[0].strip(", "),
                        tdesc.split(">")[1].split("<")[0],
                    )
                else:
                    short_description = tdesc.strip(", ")
                movie = Movie(m["id"], htmlparser.unescape(m["title"]))
                movie.other_titles = NotLoaded
                movie.release_date = NotLoaded
                movie.duration = NotLoaded
                movie.short_description = htmlparser.unescape(short_description)
                movie.pitch = NotLoaded
                movie.country = NotLoaded
                movie.note = NotLoaded
                movie.roles = NotLoaded
                movie.all_release_dates = NotLoaded
                movie.thumbnail_url = NotLoaded
                yield movie

def logger(self, message, msg_type, operation_name, counter, msg_id):
    # Method for logging SOAP packets
    try:
        try:
            if msg_type == 'RESPONSE' and operation_name != 'GetStats' and operation_name != 'GetEvents':
                pars = HTMLParser()
                message = pars.unescape(message)
                message = unicode(message, 'utf-8')
            if msg_type == 'RESPONSE' and operation_name == 'GetEvents':
                message = unicode(message, 'utf-8')
            else:
                message = unicode(message, 'utf-8')
        except:
            pars = HTMLParser()
            message = pars.unescape(message)
        if not os.path.exists(self.logs_directory):
            os.makedirs(self.logs_directory)
        log = codecs.open(os.path.join(self.logs_directory,
                                       msg_id + "_" + self.appdate + "_" + operation_name + "_OperationLog.xml"),
                          'a', encoding='utf8')
        log.write("--------------Start of " + msg_type + " - Transaction Number is " + str(counter) +
                  "--------------------\n" + message)
        log.write("\n--------------End of " + msg_type + "- Transaction Number is " + str(counter) +
                  "--------------------\n")
        log.close()
        return "Success creating LOG " + operation_name + " - " + msg_type + ": "
    except Exception:
        return "Failure creating LOG " + operation_name + " - " + msg_type + ": " + '\n' + traceback.format_exc()

def getImageLocation(comicRequest):
    titleString = 'id="ctitle">'
    captionString = 'title="'
    imageString = '//imgs.xkcd.com/comics/'
    response = urllib2.urlopen(parseComicRequest(comicRequest))
    html = response.read()
    titleStart = html.find(titleString) + len(titleString)
    titleEnd = html[titleStart:].find('<') + titleStart
    title = html[titleStart:titleEnd]
    imageAddressStart = html.find(imageString)
    imageAddressEnd = html[imageAddressStart:].find('"') + imageAddressStart
    imageAddress = html[imageAddressStart:imageAddressEnd]
    captionStart = (
        html[imageAddressEnd:].find(captionString) + imageAddressEnd + len(captionString)
    )
    captionEnd = html[captionStart:].find('"') + captionStart
    caption = html[captionStart:captionEnd]
    parser = HTMLParser()
    caption = parser.unescape(caption)
    title = parser.unescape(title)
    return '*' + title + "*\nhttp:" + str(imageAddress) + '\n' + caption

def getSets(self):
    """
    @return: Set names found
    """
    results = requests.get("http://magiccards.info/search.html", proxies=self.proxies, timeout=10)
    # print results.text
    setstag = re.search(r'<label\s+for="edition">.*?<option value=""></option>(.*?)</select>',
                        results.text, re.DOTALL)
    setsraw = setstag.group(1).strip()
    setnames = []
    hp = HTMLParser()
    print setsraw
    for st in striplist(setsraw.split(r'option>')):
        print 'stmax', st
        rawElem = re.search(r'value="(.*)">(.*?)<', st)
        if rawElem:
            setkey = hp.unescape(rawElem.group(1))
            setname = hp.unescape(rawElem.group(2))
            if not self.db.session.query(Edition).filter(Edition.name == setname).count():
                ed = Edition(setname)
                ed.sexps.append(EditionSexp(setkey, 'http://magiccards.info'))
                ed.save(self.db.session)
            print setkey, setname
            setnames.append(setname)
    try:
        self.db.session.commit()
    except IntegrityError as ie:
        print ie.message
        logger.severe('', ie)
    return setnames

def main():
    html_parser = HTMLParser()
    soup = BeautifulSoup(urlopen("http://www.amazon.com/gp/bestsellers/").read())
    categories = []

    # Scrape list of category names and urls
    for category_li in soup.find(attrs={'id': 'zg_browseRoot'}).find('ul').findAll('li'):
        category = {}
        category['name'] = html_parser.unescape(category_li.a.string)
        category['url'] = category_li.a['href']
        categories.append(category)
    del soup

    # Loop through categories and print out each product's name, rank, and url.
    for category in categories:
        print category['name']
        print '-' * 50
        soup = BeautifulSoup(urlopen(category['url']))
        i = 1
        for title_div in soup.findAll(attrs={'class': 'zg_title'}):
            if i == 1:
                print "%d. %s\n %s" % (i, html_parser.unescape(title_div.a.string), title_div.a['href'].strip())
            i += 1
        print ''

class IssuesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()

    def collect_urls(self):
        records = []
        items = self.get(self.url).findAll("item")
        for item in items:
            record = {
                "title": self.html.unescape(item.title.text),
                "timestamp_publish": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "description_html": item.description.text,
                "description": self.html.unescape(BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling,
            }
            records.append(record)
        return records

    def retrieve(self, record):
        soup = self.get(record["url"])

        # retrieve image from <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
        meta_image = soup.findAll(attrs={"property": "og:image"})
        record["image_url"] = meta_image[0]["content"].encode("utf8")

        # reset soup to content
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()
        record["body_html"] = str(soup.article)
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == "br":
                text.append("")
        record["body"] = "\n".join(text)
        return record

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            record = self.retrieve(url)
            if self.issue_provider.exists_by_url(record["url"]):
                print "found"
            else:
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                self.issue_provider.create(record)

def iter_movies(self, pattern):
    res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s' % pattern.encode('utf-8'))
    jres = json.loads(res)
    htmlparser = HTMLParser()
    for cat in ['title_popular', 'title_exact', 'title_approx']:
        if cat in jres:
            for m in jres[cat]:
                tdesc = unicode(m['title_description'])
                if '<a' in tdesc and '>' in tdesc:
                    short_description = u'%s %s' % (tdesc.split('<')[0].strip(', '),
                                                    tdesc.split('>')[1].split('<')[0])
                else:
                    short_description = tdesc.strip(', ')
                movie = Movie(m['id'], htmlparser.unescape(m['title']))
                movie.other_titles = NotLoaded
                movie.release_date = NotLoaded
                movie.duration = NotLoaded
                movie.short_description = htmlparser.unescape(short_description)
                movie.pitch = NotLoaded
                movie.country = NotLoaded
                movie.note = NotLoaded
                movie.roles = NotLoaded
                movie.all_release_dates = NotLoaded
                movie.thumbnail_url = NotLoaded
                yield movie

def original_unescape(self, s):
    """Since we need to use this sometimes"""
    if isinstance(s, basestring):
        return unicode(HTMLParser.unescape(self, s))
    elif isinstance(s, list):
        return [unicode(HTMLParser.unescape(self, item)) for item in s]
    else:
        return s

class IssuesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()

    def collect_urls(self):
        recs = []
        items = self.get(self.url).findAll("item")
        for item in items:
            rec = {
                "inserted_at": datetime.now(),
                "title": self.html.unescape(item.title.text),
                "created_at": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "Issues",
                "description_html": item.description.text,
                "description": self.html.unescape(
                    BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling
            }
            recs.append(rec)
        return recs

    def retrieve(self, rec):
        soup = self.get(rec["url"]).find("section", {"id": "content"})
        while soup.article.style is not None:
            soup.article.style.extract()
        rec["body_html"] = str(soup.article)
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == 'br':
                text.append("")
        rec["body"] = "\n".join(text)
        return rec

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            rec = self.retrieve(url)
            query = {
                "title": rec["title"],
                "article_type": rec["article_type"]
            }
            if not self.db.articles.find(query).limit(1).count():
                msg = "Inserting '{0}', created {1}"
                logging.info(msg.format(
                    rec["title"].encode("utf8"),
                    str(rec["created_at"])
                ))
                self.db.articles.insert_one(rec)

def getStatus(self):
    data = self.queryWebInterface()
    if data is None:
        print "Connection error of some sort"
        return None
    isplaying = int(data["isPlaying"])
    ispaused = int(data["isPaused"])
    playback_mode = int(data["playbackOrder"])
    if isplaying or ispaused:
        current_song_id = data["playingItem"]
    else:
        # Currently stopped so try and use either the last playing song or the currently focused item
        if len(data["prevplayedItem"]) > 0:
            current_song_id = data["prevplayedItem"]
        else:
            current_song_id = data["focusedItem"]
    if current_song_id != "?":
        current_song_id = int(current_song_id)
    # Deriving the page ourselves because playlistPage is just whatever page is currently visible,
    # not the page that our song is actually on.
    if (data["playlistActive"] == data["playlistPlaying"]) or data["playingItem"] == "?" and current_song_id != "?":
        current_page = (current_song_id / int(data["playlistItemsPerPage"])) + 1
        cur_position_on_page = current_song_id - (current_page - 1) * int(data["playlistItemsPerPage"])
        current_song_name = data["playlist"][cur_position_on_page]["t"]
        current_artist = data["playlist"][cur_position_on_page]["a"]
        try:
            next_song_in_playlist = data["playlist"][cur_position_on_page + 1]["t"] + " - " + data["playlist"][cur_position_on_page + 1]["a"]
        except:
            next_song_in_playlist = None
    else:
        if len(data["helper1"]) > 0:
            # Not on the correct playlist page, fall back to less reliable helper fields
            current_song_name = re.match("^(.*) - $", data["helper1"]).group(1)
            current_artist = re.search("(.*) - %s" % re.escape(current_song_name), data["helper2"]).group(1)
            next_song_in_playlist = None
        else:
            return None
    return_data = {}
    return_data["isplaying"] = isplaying
    return_data["ispaused"] = ispaused
    return_data["playback_mode"] = playback_mode
    # Encountered a problem with the ajquery template returning HTML escape sequences in song/artist names.
    # Hopefully this fixes it.
    h = HTMLParser()
    return_data["song_name"] = unicode(h.unescape(current_song_name)).encode("utf8")
    return_data["artist_name"] = unicode(h.unescape(current_artist)).encode("utf8")
    return_data["next_song_in_playlist"] = next_song_in_playlist
    return return_data

def escapeit(sval, EXTRAS=None):
    global _h
    _h = HTMLParser()
    # note, xmlescape and unescape do not work with utf-8 bytestrings
    # so pre-convert to full unicode and then convert back since our result xml is utf-8 encoded
    uval = sval.decode('utf-8')
    if EXTRAS:
        ures = xmlescape(_h.unescape(uval), EXTRAS)
    else:
        ures = xmlescape(_h.unescape(uval))
    return ures.encode('utf-8')

class ArticlesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, r.url
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                content = soup.article
                paragraphs = [self.html.unescape(replace_with_newlines(p))
                              for p in content.findAll("p")]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html
        return False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "inserted_at": datetime.now(),
                "created_at": parser.parse(article.time["datetime"]),
                "source": "berniesanders.com",
                "type": "DemocracyDaily",
                "excerpt_html": str(article.find(
                    "div", {"class": "excerpt"}).p),
                "excerpt": self.html.unescape(
                    article.find(
                        "div", {"class": "excerpt"}).p.text),
                "title": article.h2.text,
                "article_category": article.h1.string.strip(),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]
            query = {"title": rec["title"], "type": "DemocracyDaily"}
            if not self.db.articles.find(query).limit(1).count():
                text, html = self.retrieve_article(rec["url"])
                rec["body"], rec["body_html"] = text, html
                msg = "Inserting '{0}', created {1}"
                logging.info(msg.format(
                    rec["title"].encode("utf8"),
                    str(rec["created_at"])
                ))
                self.db.articles.insert_one(rec)

def isHTMLEntity(self, word):
    if (len(word) == 1):
        return False, word
    if (word[0] == '&'):
        h = HTMLParser()
        if (h.unescape(word) != '' and self.dict.check(h.unescape(word)) == True):
            word = unicode.encode(h.unescape(word), 'utf-8')
            return True, word
        else:
            return False, word
    else:
        return False, word

def fixEntities(self, metadata, names):
    # fix the escaped entities or we end up with things like:
    # '&amp;' in the title
    htmlparser = HTMLParser()
    for name in names:
        value = metadata.get(name)
        if isinstance(value, list):
            l = []
            for v in value:
                l.append(htmlparser.unescape(v))
            metadata[name] = l
        elif value:
            metadata[name] = htmlparser.unescape(value)
    return metadata

def main():
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
        url = u'http://api.wolframalpha.com/v2/query?input={q}&appid={API_KEY}&format=plaintext'.format(
            API_KEY=wolfram_alpha_key, q=quote(query))
        resp = requests.get(url)
        for pod in re.findall(r'<pod.+?>.+?</pod>', resp.text, re.S):
            title = re.findall(r'<pod.+?title=[\'"](.+?)[\'"].*>', pod, re.S)
            parser = HTMLParser()
            print(Fore.GREEN + parser.unescape("".join(title).strip()) + Fore.RESET)
            for inner in re.findall(r'<plaintext>(.*?)</plaintext>', pod, re.S):
                print(parser.unescape(inner.strip()))
            print('')

def parse_summary(string, isReversed=False):
    if string is None:
        string = ''
    if type(string) == list:
        string = str(string)
    if not isReversed:
        val = string.split('\n')
        for i, v in enumerate(val):
            val[i] = '%s<br/>' % v
            val[i] = val[i].replace('Step:', '<strong> Step:</strong>')
            val[i] = val[i].replace('Checkpoint:', '<strong> Checkpoint:</strong>')
            val[i] = val[i].replace('Verify point:', '<strong> Verify point:</strong>')
            val[i] = val[i].replace('*TC Steps:*', '<strong> *TC Steps:*</strong>')
            val[i] = val[i].replace('*VP:*', '<strong> *VP:*</strong>')
        return ''.join(val)
    else:
        ps = HTMLParser()
        val = string.split('<br/>')
        for i, v in enumerate(val):
            val[i] = re.sub(r'\n\t', '', val[i])
            val[i] = remove_tags(val[i])
            val[i] = ps.unescape(val[i])
            val[i] = val[i].encode('ascii', errors='ignore')
        val = filter(None, val)
        return '<br/>'.join(val).strip()

def cleaner(dummy, value, *_):
    """Cleans out unsafe HTML tags.

    Uses bleach and unescape until it reaches a fix point.

    Args:
      dummy: unused, sqlalchemy will pass in the model class
      value: html (string) to be cleaned
    Returns:
      Html (string) without unsafe tags.
    """
    # Some cases like Request don't use the title value
    # and it's nullable, so check for that
    if value is None:
        return value
    if not isinstance(value, basestring):
        # no point in sanitizing non-strings
        return value
    parser = HTMLParser()
    value = unicode(value)
    while True:
        lastvalue = value
        value = parser.unescape(
            bleach.clean(value, BLEACH_TAGS, BLEACH_ATTRS, strip=True))
        if value == lastvalue:
            break
    return value

def get_tags(html, tag_name):
    parser = HTMLParser()
    for m in re.findall('<%s(\s+[^>]*)/*>' % tag_name, html, re.IGNORECASE):
        attrs = {}
        for x in re.findall('(?:(%s))' % TAG_ATTRIBUTES_REGEX, m, re.UNICODE):
            if x[1]:
                attrs[x[1]] = parser.unescape(x[2])
            elif x[3]:
                attrs[x[3]] = parser.unescape(x[4])
            elif x[5]:
                attrs[x[5]] = parser.unescape(x[6])
            elif x[7]:
                attrs[x[7]] = parser.unescape(x[7])
        yield attrs

def get_imdb_movie_reviews(id, title, year):
    score_max = 10.0
    link = "http://www.imdb.com/title/tt%0.7d/" % id
    url = web.URL(link)
    dom = web.DOM(url.download(cached=True))
    overall = float(dom.by_class("titlePageSprite star-box-giga-star")[0].content.strip()) / score_max
    # try to get year directly from page; this isn't present in every entry
    try:
        year = dom('span.itemprop[itemprop=name]')[0].next.next.by_tag('a')[0].content
        year = int(year)
    except:
        pass
    rc = dom.by_attr(itemprop="reviewCount")[0].content.split(" ")[0].replace(",", "")
    revlink = link + 'reviews?count=%s&start=0' % rc  # get at most 20 reviews
    url = web.URL(revlink)
    dom = web.DOM(url.download(cached=True))
    parser = HTMLParser()
    lst = []
    hrs = dom.by_id('tn15main').by_tag('hr')
    for hr in hrs:
        div = hr.next.next
        try:
            score = float(div.by_tag("img")[1].attrs["alt"].split("/")[0]) / score_max
            date = div.by_tag("small")[2].content
        except:
            continue
        user = div.by_tag("a")[1].content
        p = div.next.next
        review = parser.unescape(p.content.replace("<br />", "\n"))
        lst.append(dict(critic=user, norm_score=score, quote=review,
                        id=id, title=title, source="IMDB", overall_score=overall, year=year, date=date))
    return lst

def get_jobs(self):
    try:
        jobs_start_time = time.time()
        h = HTMLParser()
        html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
        counter = 0
        for a in data:
            if a.has_attr('href'):
                counter = counter + 1
                #self.DrawSpinner(counter)
                try:
                    return_code = self.get_job_info(self.browser, self.base_job_url + a['href'].split('?')[1])
                    if return_code == 1:
                        #In case the error pages starts to come
                        jobs_end_time = time.time()
                        print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
                        return
                except Exception:
                    continue
        jobs_end_time = time.time()
        print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
    except Exception as e:
        print 'exception= ', str(e)
        #print 'stacktrace= ', traceback.print_exc()
        print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)

def cleaner(dummy, value, *_):
    """Cleans out unsafe HTML tags.

    Uses bleach and unescape until it reaches a fix point.

    Args:
      dummy: unused, sqlalchemy will pass in the model class
      value: html (string) to be cleaned
    Returns:
      Html (string) without unsafe tags.
    """
    # Some cases don't use the title value and it's nullable, so check for that
    if value is None:
        return value
    if not isinstance(value, basestring):
        # no point in sanitizing non-strings
        return value
    parser = HTMLParser()
    value = unicode(value)
    while True:
        lastvalue = value
        value = parser.unescape(
            bleach.clean(value, BLEACH_TAGS, BLEACH_ATTRS, strip=True)
        )
        if value == lastvalue:
            break
    return value

def tj_striphtml(value):
    parser = HTMLParser()
    s = MLStripper()
    s.feed(parser.unescape(value))
    return s.get_data()

def _process_title(title):
    """Process the title field of feed.entries."""
    html_parser = HTMLParser()
    title = html_parser.unescape(title)
    # unescape a second time to handle double-escaped HTML
    title = html_parser.unescape(title)
    # strip newline characters
    title = title.replace('\r', '').replace('\n', '')
    return remove_html_tag(title)

def get(self, response, page, api_type, api_value):
    channel = {
        'page': page,
        'page_patten': None,
        'movies': []
    }
    response = json.loads(response)
    movies = response['data']
    if 'total' in response and response['total'] > 24:
        channel['page'] = int(round(response['total'] / 24))
        channel['page_patten'] = '%s|%s' % (api_type, api_value)
    h = HTMLParser()
    for movie in movies:
        type = self.get_quality(int(movie['quality']))
        label = "[%s] %s" % (type, movie['name'])
        if not movie['is_movie']:
            label = "[%s/%s] %s" % (movie['meta']['max_episode_name'], movie['time'], movie['name'])
        channel['movies'].append({
            'id': movie['slug'],
            'label': label.encode("utf-8"),
            'title': movie['name'].encode("utf-8"),
            'realtitle': movie['name'].encode("utf-8"),
            'thumb': movie['thumbnail'],
            'poster': movie['poster'],
            'type': type,
            'intro': h.unescape(movie['description'])
        })
    return channel

class ArkTweetNLP:

    def __init__(self, data=[]):
        # Lookup cache (constantly rerunning tagger takes time)
        self.cache = Cache('ark_tweet')
        # Unescape data
        self.h = HTMLParser()
        # Resolve and cache all currently uncached tweets
        self.resolve(data)

    def normalizeKey(self, tweet):
        clean = lambda txt: self.h.unescape(txt).strip()
        try:
            tmp = tweet.decode('utf-8')
        except UnicodeEncodeError, e:
            # Didn't want to resort to this, but get each character one at a time
            ctmp = []
            for c in tweet:
                try:
                    c.decode('utf-8')
                    ctmp.append(c)
                except UnicodeEncodeError, e:
                    continue
            tmp = ''.join(ctmp)

def getMovie(html):
    hxs = lxml.html.document_fromstring(html)
    hp = HTMLParser()
    movie = {}
    try:
        movie['title'] = hp.unescape(hxs.xpath('//*[@id="overview-top"]/h1/span[1]/text()')[0].strip())
    except IndexError:
        movie['title'] = ""
    try:
        original_title = hxs.xpath('//*[@id="overview-top"]/h1/span[3]/text()')[0].strip()
        movie['original_title'] = original_title.replace('"', '')
    except:
        movie['original_title'] = ""
    try:
        movie['type'] = hxs.xpath('//*[@id="overview-top"]/div[1]/text()')[0].strip()
    except:
        movie['type'] = ""
    try:
        movie['year'] = int(hxs.xpath('//*[@id="overview-top"]/h1/span[2]/a/text()')[0].strip())
    except:
        try:
            movie['year'] = int(hxs.xpath('//*[@id="overview-top"]/h1/span[3]/a/text()')[0].strip())
        except:
            try:
                movie['year'] = int(hxs.xpath('//*[@id="overview-top"]/h1/span[2]/text()')[0].strip().replace('(', '').replace(')', ''))
            except:
                movie['year'] = 0
    try:
        duration = hxs.xpath('//*[@id="overview-top"]/div[2]/time/text()')[0].strip()
        movie['duration'] = int(duration.replace('min', '').strip())
    except:
        movie['duration'] = 0
    try:
        movie['genres'] = hxs.xpath('//*[@id="overview-top"]/div[2]/a/span/text()')
    except:
        movie['genres'] = []
    try:
        movie['release_date'] = hxs.xpath('//*[@id="overview-top"]/div[2]/span[3]/a/text()')[0].strip()
    except:
        try:
            movie['release_date'] = hxs.xpath('//*[@id="overview-top"]/div[2]/span[4]/a/text()')[0].strip()
        except:
            movie['release_date'] = ""
    try:
        movie['rating'] = float(hxs.xpath('//*[@id="overview-top"]/div[3]/div[3]/strong/span/text()')[0].strip())
    except:
        movie['rating'] = 0
    try:
        movie['poster'] = hxs.xpath('//*[@id="img_primary"]/div/a/img/@src')[0].strip()
    except:
        movie['poster'] = ""
    try:
        movie['actors'] = hxs.xpath('//*[@id="overview-top"]/div[6]/a/span/text()')
    except:
        movie['actors'] = ""
    return movie

def update_event_description(event_id, description, analyst):
    """
    Update event description.

    :param event_id: The ObjectId of the Event to update.
    :type event_id: str
    :param description: The new description.
    :type description: str
    :param analyst: The user updating this Event.
    :type analyst: str
    :returns: dict with keys "success" (boolean) and "message" (str)
    """

    if not description:
        return {'success': False, 'message': "No description to change"}
    event = Event.objects(id=event_id).first()
    if not event:
        return {'success': False, 'message': "No event found"}

    # Have to unescape the submitted data. Use unescape() to handle
    # &lt; and friends. Use urllib2.unquote() to handle %3C and friends.
    h = HTMLParser()
    description = h.unescape(description)
    event.description = description
    try:
        event.save(username=analyst)
        return {'success': True}
    except ValidationError, e:
        return {'success': False, 'message': e}

def first_selected_option_text(self):
    if not self.is_patternfly:
        return text(self.first_selected_option)
    else:
        parser = HTMLParser()
        return parser.unescape(
            execute_script("return arguments[0].innerHTML;", self.first_selected_option))

def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()
    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())
    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()
        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])
        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None
        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )
        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex",
                                              location.encode("Utf-8") if location else "")

def twitch_lookup(location):
    locsplit = location.split("/")
    if len(locsplit) > 1 and len(locsplit) == 3:
        channel = locsplit[0]
        type = locsplit[1]  # should be b or c
        id = locsplit[2]
    else:
        channel = locsplit[0]
        type = None
        id = None
    h = HTMLParser()
    fmt = "{}: {} playing {} ({})"  # Title: nickname playing Game (x views)
    if type and id:
        if type == "b":  # I haven't found an API to retrieve broadcast info
            soup = http.get_soup("http://twitch.tv/" + location)
            title = soup.find('span', {'class': 'real_title js-title'}).text
            playing = soup.find('a', {'class': 'game js-game'}).text
            views = soup.find('span', {'id': 'views-count'}).text + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
        elif type == "c":
            data = http.get_json("https://api.twitch.tv/kraken/videos/" + type + id)
            title = data['title']
            playing = data['game']
            views = str(data['views']) + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
    else:
        data = http.get_json(
            "http://api.justin.tv/api/stream/list.json?channel=" + channel)[0]
        if data:
            title = data['title']
            playing = data['meta_game']
            viewers = "\x033\x02Online now!\x02\x0f " + str(
                data["channel_count"]) + " viewer"
            print viewers
            viewers = viewers + "s" if not " 1 view" in viewers else viewers
            print viewers
            return h.unescape(fmt.format(title, channel, playing, viewers))
        else:
            data = http.get_json("https://api.twitch.tv/kraken/channels/" + channel)
            title = data['status']
            playing = data['game']
            viewers = "\x034\x02Offline\x02\x0f"
            return h.unescape(fmt.format(title, channel, playing, viewers))

def GetNewestCurseData(name, unused_mod):
    parser = HTMLParser()

    # Name the project.
    projectUrl = baseUrl + '/projects/' + str(name)
    projectUrl = DerefUrl(projectUrl).split('?')[0]

    # Find the project ID.
    projectPage = Get(projectUrl)
    tree = soupparser.fromstring(projectPage)
    projectID = int(
        tree.xpath('//li[@class="view-on-curse"]/a/@href')[0].split('/')[-1])
    projectTitle = tree.xpath(
        '//h1[@class="project-title"]//span/text()')[0]

    # Find the newest copy of the mod.
    # TODO: Filter by stability, regex, whatever. Add once needed.
    filesUrl = projectUrl + '/files?filter-game-version=2020709689%3A6170'
    filesPage = Get(filesUrl)
    tree = soupparser.fromstring(filesPage)
    files = tree.xpath(
        '//div[@class="project-file-name-container"]/a[@class="overflow-tip"]/@href')
    names = tree.xpath(
        '//div[@class="project-file-name-container"]/a[@class="overflow-tip"]/text()')
    stability = tree.xpath(
        '//td[@class="project-file-release-type"]/div/@class')
    assert len(files) == len(names) == len(stability)
    files_filtered = []
    names_filtered = []
    for i in xrange(len(files)):
        if 'alpha' not in stability[i]:
            files_filtered.append(files[i])
            names_filtered.append(names[i])
    if files_filtered:
        files = files_filtered
        names = names_filtered

    data = {
        PROJECTID: projectID,
        PROJECTPAGE: projectUrl,
        TITLE: projectTitle,
    }

    if files:
        # Find the URL and MD5 of that file.
        filePage = Get(baseUrl + files[0])
        tree = soupparser.fromstring(filePage)
        hash = tree.xpath('//span[@class="%s"]/text()' % HASH)
        url = tree.xpath('//a[@class="button fa-icon-download"]/@href')
        data[FILENAME] = parser.unescape(names[0])
        data[HASH] = hash[0]
        data[SRC] = baseUrl + url[0]

        # Find the dependencies for this file.
        dependencies = [
            int(url.split('/')[-1]) for url in tree.xpath(
                '//*[text()="Required Library"]/following-sibling::ul/li/a/@href')
        ]
        data[DEPENDENCIES] = dependencies

    IncProgressbar(data[TITLE])
    return FixupData(data)

def twtt2(tweet):
    h = HTMLParser()
    for match in re.finditer("&[^\s]*;", tweet):
        try:
            tweet = tweet.replace(match.group(), str(h.unescape(match.group())))
        except UnicodeDecodeError:
            continue
    return tweet

def load_urls_name_page():
    connect = urllib2.urlopen("http://listen.jazzradio.com/public3")
    data = connect.read()
    p = HTMLParser()
    data = p.unescape(data)
    for i in simplejson.loads(data):
        print "%s = %s" % (i["name"].encode('utf-8'), i["playlist"].encode('utf-8'))

def parse_news(self, news):
    u_news = news.decode("utf-8")
    ascii_news = u_news.encode("ascii", "ignore")
    parser = HTMLParser()  # Initializes parser
    parsed_news_text = parser.unescape(ascii_news)  # Parses HTML numeric character references
    quote_removed_news_text = parsed_news_text.replace('"', '').replace("'", '')  # Removes single and double quotes
    return(quote_removed_news_text)

def test_is_html_escaped(self):
    """Unescape the escaped response to see if it's the original content"""
    h = HTMLParser()
    content = '*****@*****.**'
    self.assertEqual(
        h.unescape(
            self.middleware.process_response(
                None, HttpResponse(content)).content),
        content)

def _process_odoo_data(self, odoo_data):
    # Concatenation of all boxes in one text
    html_parser = HTMLParser()
    fields = ('original_text', 'english_text', 'translated_text')
    for field in fields:
        if field in odoo_data:
            odoo_data[field] = html_parser.unescape(
                BOX_SEPARATOR.join(odoo_data[field]))

def process_html_content(self, line):
    line_tmp = line.strip()
    #fixed_backslashes = self.regexbackslash.sub(r"\\\\", line_tmp)
    parser = HTMLParser()
    line = parser.unescape(self.unescapeHtmlChars(line))
    processed_output = self.strip_tags(line) + "\n"
    return processed_output.encode("utf-8")

def main():
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
        url = u'http://api.wolframalpha.com/v2/query?input={q}&appid={API_KEY}&format=plaintext'.format(
            API_KEY=wolfram_alpha_key, q=quote(query))
        resp = requests.get(url)
        for pod in re.findall(r'<pod.+?>.+?</pod>', resp.text, re.S):
            title = re.findall(r'<pod.+?title=[\'"](.+?)[\'"].*>', pod, re.S)
            parser = HTMLParser()
            print(Fore.GREEN + parser.unescape("".join(title).strip()) + Fore.RESET)
            for inner in re.findall(r'<plaintext>(.*?)</plaintext>', pod, re.S):
                print(parser.unescape(inner.strip()))
            print('')

def _create_elem_tree(self, filename):
    f = open(filename, 'r')
    h = HTMLParser()
    unescaped = h.unescape(f.read())
    f.close()
    tree = ElementTree()
    tree.parse(StringIO(unescaped))
    return tree

def twtt2(text):
    """ Takes in a tweet and replaces html character codes with their ASCII equivalent. """
    htmlparser = HTMLParser()
    # unescape converts HTML character references to unicode text;
    # encode turns the result into a bytestring, ignoring characters that can't be converted
    return htmlparser.unescape(text).encode('utf-8', 'ignore')

def Request(query):
    try:
        r = requests.get(args.backendurl + query)
        r.raise_for_status()
        h = HTMLParser()
        Send(h.unescape(r.text.strip(' \n\t\r')))
    except Exception, e:
        print e

def get_distribution_metadata(resource_id):
    # 'datajson_actions' is imported inside this function to avoid circular dependencies with 'config_controller'
    json_dict = get_data_json_contents()
    html_parser = HTMLParser()
    json_dict = html_parser.unescape(json_dict)
    datajson = DataJson(json_dict)
    dist = datajson.get_distribution(resource_id)
    return dist

def transform_filename(filename):
    LOGGER.debug(filename)
    # first unescape
    parser = HTMLParser()
    unescaped = parser.unescape(filename)
    # replace '/' with unicode U+2215 '∕'
    new_name = unescaped.replace('/', u'\u2215')
    return new_name

def get_mail_recipient(self):
    mail_recipient = ''
    groups_ids = self._get_group_ids()
    for record in groups_ids:
        for records in record.users:
            mail_recipient = ('' if records.partner_id.email == False
                              else records.partner_id.email + ',') + mail_recipient
    parser = HTMLParser()
    return parser.unescape(mail_recipient)

def _to_smart(verse):
    verse = verse.replace(",`", ", '")
    verse = verse.replace("`", "'")
    out = smartypants(verse)
    parser = HTMLParser()
    out = parser.unescape(out)
    return out

def main():
    opts = webdriver.ChromeOptions()
    # opts.binary_location('/Applications/Google Chrome 2.app/Contents/MacOS/Google Chrome')
    # the binary location was written directly into ChromeOptions.__init__, since setting it here didn't work on my machine
    driver = webdriver.Chrome(chrome_options=opts)
    driver.get("https://www.zhihu.com/question/28481779")

    def execute_times(times):
        for i in range(times + 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            try:
                driver.find_element_by_css_selector('button.QuestionMainAction').click()
                print "page" + str(i)
                time.sleep(1)
            except:
                break

    execute_times(5)

    result_raw = driver.page_source
    result_soup = BeautifulSoup(result_raw, 'html.parser')
    result_bf = result_soup.prettify()
    with open("/Users/zhanglei/crawler/zhihu/raw_result.txt", 'w') as girls:
        girls.write(result_bf)
    print 'store raw data successfully!'

    with open("/Users/zhanglei/crawler/zhihu/noscript_meta.txt", 'w') as noscript_meta:
        noscript_nodes = result_soup.find_all('noscript')
        noscript_inner_all = ""
        for noscript in noscript_nodes:
            noscript_inner = noscript.get_text()
            noscript_inner_all += noscript_inner + '\n'
        h = HTMLParser()
        noscript_all = h.unescape(noscript_inner_all)
        noscript_meta.write(noscript_all)
    print 'store noscript meta data successfully!'

    img_soup = BeautifulSoup(noscript_all, 'html.parser')
    img_nodes = img_soup.find_all('img')
    with open("/Users/zhanglei/crawler/zhihu/img_meta.txt", 'w') as img_meta:
        count = 0
        for img in img_nodes:
            if img.get('src') is not None:
                img_url = img.get('src')
                line = str(count) + "\t" + img_url + "\n"
                img_meta.write(line)
                urllib.urlretrieve(img_url, "/Users/zhanglei/crawler/zhihu/image/" + str(count) + ".jpg")
                count += 1
    print 'store meta data and image successfully!'

def check_request(self, request, context):
    h = HTMLParser()
    unescaped_trimmed_request = h.unescape(
        request.replace(' ', '').replace('\\n', '').lower())
    for item in UnvalidatedRedirectsChecker.FORBIDDEN_RESOURCES:
        if item in unescaped_trimmed_request:
            return None
    return request

def sovet(args, message):
    sovet = url.urlopen("http://f*****g-great-advice.ru/api/random").read()
    parser = HTMLParser()
    params = {
        'message': parser.unescape(json.loads(sovet)["text"]).encode('utf8')
    }
    params.update(DIALOG_PARAM)
    return execute_in_vk(SEND_MESSAGE_COMMAND, params)

def html_unescape(html):
    """Reverse (unescape) HTML escape characters."""
    if type(html) == str:
        html = html.decode('utf-8')
    html_parser = HTMLParser()
    html = html_parser.unescape(html)
    return html

def normalize_html(self, html):
    """Strip HTML Tags and normalize gratuitous newlines."""
    parser = HTMLParser()
    results = parser.unescape('\n'.join(
        ' '.join(line.split())
        for line in strip_tags(html).splitlines() if line))
    return '\n'.join(filter(bool, results.split('\n\n')))

def send_request(url, command, headers=None, data=None):
    # Request files don't specify HTTP vs HTTPS, so I'm trying to try both instead of
    # asking the user. If no http or https, first try http, and if that errors, try
    # https. Request files can be used for GET's also, so I pull that part out as well.
    if 'http' not in url:
        try:
            http_url = 'http://%s' % url
            if (data):
                response = requests.post(http_url, headers=headers, data=data, verify=False)
            else:
                response = requests.get(http_url, headers=headers, verify=False)
        except Exception as error:
            print error
            try:
                https_url = 'https://%s' % url
                if (data):
                    response = requests.post(https_url, headers=headers, data=data, verify=False)
                else:
                    response = requests.get(https_url, headers=headers, verify=False)
            except Exception as error:
                print error
    else:
        try:
            response = requests.get(url, headers=headers, verify=False)
        except Exception as error:
            print "[!] Failed to establish connection"
            # print error
            exit()

    # print response.headers
    # print response.content
    match = re.search('([---------------------------------------------------][\n])(.*)', response.content)
    try:
        command_output = str(match.group(0))
        print '\n{}\nOUTPUT OF: {}\n{}'.format('-' * 30, command, '-' * 30)
        # print command_output.replace('\\n','\n')
        command_output = command_output.replace('\\n', '\n')
        h = HTMLParser()
        print(h.unescape(command_output))
        # print command_output
    except Exception as error:
        print "\n[!] Could not find command output. Debug info:\n"
        print "---------------Response Headers---------------"
        print response.headers
        print "---------------Response Content---------------"
        print response.content
        return error

def cleanInput(str):
    if type(str) is not unicode:
        str = unicode(str, "iso-8859-15")
    xmlc = re.compile('&#(.+?);', re.DOTALL).findall(str)
    for c in xmlc:
        str = str.replace("&#" + c + ";", unichr(int(c)))
    p = HTMLParser()
    str = p.unescape(str)
    return str

def execute():
    h = HTMLParser()
    for name, rule in frappe.db.sql(
            """SELECT name, rule FROM `tabItem Variant Restrictions`
               WHERE rule is not null""", as_list=1):
        frappe.db.set_value("Item Variant Restrictions", name, 'rule', h.unescape(rule))

def loadCatemapSF(csvfile):
    from HTMLParser import HTMLParser
    htmlparser = HTMLParser()
    schema = StructType([
        StructField("Category", StringType(), False),
        StructField("Descript", StringType(), False),
        StructField("count", IntegerType(), False),
        StructField("New_Class", StringType(), False),
    ])
    df = spark.read.csv(csvfile, header=True, schema=schema)
    pairs = df.select("Category", "Descript", "New_Class").collect()
    catemap = {(htmlparser.unescape(s["Category"]), htmlparser.unescape(s["Descript"])): htmlparser.unescape(s["New_Class"])
               for s in pairs}
    return catemap

def getText(textElement):
    regex = re.compile('\ +')
    html = HTMLParser()
    text = regex.sub(' ', textElement)
    text = text.strip()
    text = html.unescape(text)
    return text

def change_ref(text):
    # handle HTML escape sequences found in the page
    datas = re.findall(REF_PATTERN, text)
    if len(datas) > 0:
        parser = HTMLParser()
        datas = set(datas)
        for data in datas:
            replace = parser.unescape(data)
            text = text.replace(data, replace)
    return text