def update_text(clickData):
    if clickData['points'][0]['curveNumber'] == 0:
        return html.Div([
            html.A("Direct link to HackerNews user comment",
                   href="https://news.ycombinator.com/item?id="
                   + str(hackernews_xamarin_Id_Data[
                       clickData['points'][0]['pointIndex']]),
                   target="_blank"),
            html.H3(HP.HTMLParser().unescape(hackernews_xamarin_Body_Data[
                clickData['points'][0]['pointIndex']]))
        ])
    if clickData['points'][0]['curveNumber'] == 1:
        return html.Div([
            html.A("Direct link to HackerNews user comment",
                   href="https://news.ycombinator.com/item?id="
                   + str(hackernews_react_native_Id_Data[
                       clickData['points'][0]['pointIndex']]),
                   target="_blank"),
            html.H3(HP.HTMLParser().unescape(hackernews_react_native_Body_Data[
                clickData['points'][0]['pointIndex']]))
        ])
    if clickData['points'][0]['curveNumber'] == 2:
        return html.Div([
            html.A("Direct link to HackerNews user comment",
                   href="https://news.ycombinator.com/item?id="
                   + str(hackernews_flutter_Id_Data[
                       clickData['points'][0]['pointIndex']]),
                   target="_blank"),
            html.H3(HP.HTMLParser().unescape(hackernews_flutter_Body_Data[
                clickData['points'][0]['pointIndex']]))
        ])
def update_text(hoverData):
    if hoverData['points'][0]['curveNumber'] == 0:
        return HP.HTMLParser().unescape(
            reddit_xamarin_Body_Data[hoverData['points'][0]['pointIndex']])
    if hoverData['points'][0]['curveNumber'] == 1:
        return HP.HTMLParser().unescape(reddit_react_native_Body_Data[
            hoverData['points'][0]['pointIndex']])
    if hoverData['points'][0]['curveNumber'] == 2:
        return HP.HTMLParser().unescape(
            reddit_flutter_Body_Data[hoverData['points'][0]['pointIndex']])
def replaceHTMLCodes(txt):
    # add the missing ";" to bare numeric character references such as "&#39"
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    try:
        import html.parser as html_parser
    except ImportError:
        import HTMLParser as html_parser
    # unescape twice to handle double-encoded entities
    txt = html_parser.HTMLParser().unescape(txt)
    txt = html_parser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    txt = txt.strip()
    return txt
def convert_content(content):
    html_parser = parser.HTMLParser()
    content = html_parser.unescape(content)
    if not disable_convert_code_tag:
        content = convert_code_tag(content)
    return content
def trombi():
    h = parser.HTMLParser()
    filters = ""
    method = request.method
    error, session, params = log_and_check_params(
        ["token", "location", "year"], request)
    if error != {}:
        return json.dumps(error), error['error']['code']
    try:
        for param in params:
            if param != "login" and param != "password":
                filters = filters + "&%s=%s" % (param, params[param])
        r = session.post(server_url + "/user/filter/user?format=json" + filters,
                         verify=ssl_verify, cookies={'language': 'fr'})
        if r.status_code == 403:
            return json.dumps({
                "error": {
                    "message": "Connection token is invalid or has expired",
                    'code': 403
                }
            }), 403
        return clean_json(r.text)
    except Exception as e:
        return json.dumps({
            "error": {
                "message": "Server was unable to connect to Epitech's intra API",
                "code": 500
            }
        }), 500
def find_tv_show_season(content, tvshow, season):
    url_found = None
    found_urls = []
    possible_matches = []
    all_tvshows = []
    h = HTMLParser.HTMLParser()
    for matches in re.finditer(movie_season_pattern, content,
                               re.IGNORECASE | re.DOTALL):
        found_title = matches.group('title')
        found_title = h.unescape(found_title)
        if matches.group('link') in found_urls:
            continue
        log(__name__, "Found tv show season on search page: %s" % found_title)
        found_urls.append(matches.group('link'))
        s = difflib.SequenceMatcher(
            None,
            (found_title + ' ' + matches.group('year')).lower(),
            tvshow.lower())
        all_tvshows.append(
            matches.groups() + (s.ratio() * int(matches.group('numsubtitles')),))
        if found_title.lower().find(tvshow.lower() + " ") > -1:
            if found_title.lower().find(season.lower()) > -1:
                log(__name__,
                    "Matching tv show season found on search page: %s" % found_title)
                possible_matches.append(matches.groups())
    if len(possible_matches) > 0:
        possible_matches = sorted(possible_matches, key=lambda x: -int(x[3]))
        url_found = possible_matches[0][0]
        log(__name__, "Selecting matching tv show with most subtitles: %s (%s)" % (
            possible_matches[0][1], possible_matches[0][3]))
    else:
        if len(all_tvshows) > 0:
            all_tvshows = sorted(all_tvshows, key=lambda x: -int(x[4]))
            url_found = all_tvshows[0][0]
            log(__name__,
                "Selecting tv show with highest fuzzy string score: %s (score: %s subtitles: %s)" % (
                    all_tvshows[0][1], all_tvshows[0][4], all_tvshows[0][3]))
    return url_found
def es_ingest_objects(self):
    ingest_list = []
    parser = htmlparser.HTMLParser()
    es_dict_template = {"_index": "netflix_crossing", "_type": "nfobject"}
    title_list = []
    for nfobject in self.nfobjects:
        ingest_dict = es_dict_template.copy()
        nfobject["title"] = parser.unescape(nfobject["title"])
        nfobject["synopsis"] = parser.unescape(nfobject["synopsis"])
        title_list.append(nfobject["title"])
        ingest_dict.update(nfobject)
        ingest_list.append(ingest_dict)
    helpers.bulk(self.es, ingest_list)
    print("Objects ingested")
    print("Title list")
    print(title_list)
def replaceEscapeCodes(txt):
    try:
        import html.parser as html_parser
    except ImportError:
        import HTMLParser as html_parser
    txt = html_parser.HTMLParser().unescape(txt)
    return txt
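# Note: HTMLParser.unescape() was deprecated in Python 3.4 and removed in
# Python 3.9, so the instance-method fallbacks used throughout these snippets
# only run on older interpreters. A minimal, version-agnostic sketch using the
# standard library is shown below; the helper name unescape_html is
# hypothetical and not part of the surrounding code.
def unescape_html(txt):
    try:
        # Python 3.4+: module-level html.unescape()
        from html import unescape
    except ImportError:
        # Python 2: fall back to the HTMLParser instance method
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape
    return unescape(txt)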
def find_by_title(self, title):
    default_find_by_title_params = {'json': '1',
                                    'nr': 1,
                                    'tt': 'on',
                                    'q': title}
    query_params = urlencode(default_find_by_title_params)
    results = self.get(('http://www.imdb.com/'
                        'xml/find?{0}').format(query_params))
    keys = ['title_popular', 'title_exact', 'title_approx', 'title_substring']
    title_results = []
    html_unescape = htmlparser.HTMLParser().unescape
    # Loop through all results and build a list with popular matches first
    for key in keys:
        if key in results:
            for r in results[key]:
                year = None
                year_match = re.search(r'(\d{4})', r['title_description'])
                if year_match:
                    year = year_match.group(0)
                title_match = {
                    'title': html_unescape(r['title']),
                    'year': year,
                    'imdb_id': r['id']
                }
                title_results.append(title_match)
    return title_results
def restoreIni(self):
    #prjFileInfo = QtCore.QFileInfo(core.QgsProject.instance().fileName())
    #iniFileInfo = QtCore.QFileInfo(os.path.join(prjFileInfo.path(), prjFileInfo.baseName() + ".gsv"))
    stored_settings = core.QgsExpressionContextUtils.projectScope(
        core.QgsProject.instance()).variable('go2sv_infolayer_settings')
    if stored_settings:
        self.infoBoxIni = json.loads(stored_settings)
        self.loadPointLayers(default=self.infoBoxIni["infoLayer"])
        self.infoField.setText(self.infoBoxIni["infoField"])
    else:
        self.infoBoxIni = {'infoLayerEnabled': None,
                           'infoBoxTemplate': u'',
                           'infoField': '',
                           'infoBoxEnabled': None,
                           'iconPath': '',
                           'infoLayer': '',
                           'distanceBuffer': '100',
                           "mapCommandsEnabled": None}
        self.loadPointLayers()
    if self.infoBoxIni["infoLayerEnabled"]:
        self.enableInfoLayerCheckbox.setCheckState(QtCore.Qt.Checked)
    else:
        self.enableInfoLayerCheckbox.setCheckState(QtCore.Qt.Unchecked)
    self.iconPath.setText(self.infoBoxIni["iconPath"])
    self.distanceBuffer.setText(self.infoBoxIni["distanceBuffer"])
    if self.infoBoxIni["infoBoxEnabled"]:
        self.enableInfoBoxCheckbox.setCheckState(QtCore.Qt.Checked)
    else:
        self.enableInfoBoxCheckbox.setCheckState(QtCore.Qt.Unchecked)
    if self.infoBoxIni["mapCommandsEnabled"]:
        self.mapCommandsCheck.setCheckState(QtCore.Qt.Checked)
    else:
        self.mapCommandsCheck.setCheckState(QtCore.Qt.Unchecked)
    html_parser = HTMLParser.HTMLParser()
    self.infoboxHtml.setPlainText(
        html_parser.unescape(self.infoBoxIni["infoBoxTemplate"]))
    self.enableInfoLayerAction(True)
    if self.infoIndex and self.enableInfoLayerCheckbox.isChecked():
        self.updateSpatialIndex()
    self.defined.emit()
def getDocFromNode(ns, node, retval=None):
    annotation_node = node.find('xs:annotation', ns)
    if annotation_node is None:
        return retval
    documentation_node = annotation_node.find('xs:documentation', ns)
    if documentation_node is None:
        return retval

    # Be sure to grab _all_ content in the <xs:documentation> node.
    # In the documentation nodes, use XML entities ("&lt;" instead of "<")
    # for documentation characters that would otherwise be considered as XML.
    s = lxml.etree.tostring(documentation_node, method="text", pretty_print=True)
    rst = s.decode().lstrip('\n')    # remove any leading blank lines
    rst = rst.rstrip()               # remove any trailing white space
    text = textwrap.dedent(rst)      # remove common leading space

    # substitute HTML entities in markup: "&lt;" for "<"
    # thanks: http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    try:    # see #661
        import html
        text = html.unescape(text)
    except (ImportError, AttributeError):
        from html import parser as HTMLParser
        htmlparser = HTMLParser.HTMLParser()
        text = htmlparser.unescape(text)
    return text.lstrip()
def __init__(self, *args, **kwargs):
    self.htmlparser = HTMLParser.HTMLParser()
    super(Flagger, self).__init__(*args, **kwargs)
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(
        (self.debug or self.verbose) and logging.DEBUG or logging.ERROR)
    self.now = int(time.time())
def fillRaiSportKeys(self):
    # search for items in main menu
    RaiSportKeys = []
    try:
        data = utils.checkStr(urllib2.urlopen(self.RaiSportMainUrl).read())
    except urllib2.HTTPError:
        data = ''
    m = re.search("<a href=\"javascript:void\(0\)\">Menu</a>(.*?)</div>",
                  data, re.S)
    if not m:
        return []
    menu = m.group(0)
    links = re.findall("<a href=\"(?P<url>[^\"]+)\">(?P<title>[^<]+)</a>", menu)
    good_links = []
    for l in links:
        if ('/archivio.html?' in l[0]) and not ('&' in l[0]):
            good_links.append({'title': l[1], 'url': l[0]})
    good_links.append({
        'title': self.RaiPlayAddonHandle.getLocalizedString(32015),
        'url': '/archivio.html?tematica=altri-sport'
    })
    # open any single page in list and grab search keys
    for l in good_links:
        try:
            data = utils.checkStr(
                urllib2.urlopen(self.RaiSportMainUrl + l['url']).read())
        except urllib2.HTTPError:
            data = ''
        dataDominio = re.findall("data-dominio=\"(.*?)\"", data)
        dataTematica = re.findall("data-tematica=\"(.*?)\"", data)
        xbmc.log(str(dataTematica))
        if dataTematica:
            if len(dataTematica) > 1:
                del (dataTematica[0])
            try:
                title = dataTematica[0].split('|')[0]
                title = utils.checkStr(
                    HTMLParser.HTMLParser().unescape(title))
                params = {
                    'title': title,
                    'dominio': dataDominio[0],
                    'sub_keys': dataTematica
                }
                RaiSportKeys.append(params)
            except:
                xbmc.log("error in key %s" % str(dataTematica))
    return RaiSportKeys
def xml_constructor(self, soup, link, tpburl, info):
    page = HTMLParser(soup)
    if info[0] == "search":
        try:
            title = page.title
        except:
            title = info[1]
    elif info[0] in ["browse", "user"]:
        try:
            title = parser.HTMLParser().unescape(
                search('<title>(.*) - TPB</title>', soup).group(1))
        except:
            title = info[1]
    elif info[0] == "recent":
        title = "Recent Torrents"
    xml = "<rss version=\"2.0\">\n\t<channel>\n\t\t"
    xml += "<title>TPB2RSS: %s</title>\n\t\t" % title
    xml += "<link>%s%s</link>\n\t\t" % (tpburl, parse.quote(link))
    xml += "<description>The Pirate Bay %s feed for \"%s\"</description>\n\t\t" % (info[0], title)
    xml += "<lastBuildDate>%s GMT</lastBuildDate>\n\t\t" % datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S")
    xml += "<language>en-us</language>\n\t\t"
    xml += "<generator>TPB2RSS %s</generator>\n\t\t" % __version__
    xml += "<docs>%s</docs>\n\t\t" % __docs__
    xml += "<webMaster>%s (%s)</webMaster>" % (__email__, __author__)
    position = 0
    for i in range(int(len(page.data) / 4)):
        item = str(page.data[position + 1]).split("\"")
        seeders = str(str(page.data[position + 2]).split(">")[1]).split("<")[0]
        leechers = str(str(page.data[position + 3]).split(">")[1]).split("<")[0]
        category = sub(r"(\n|\t)", "",
                       (compile(r'<.*?>').sub('', page.data[0]).replace("(", " (")))
        xml += self.item_constructor(item, seeders, leechers, category, tpburl)
        position += 4
    xml += "\n\t</channel>\n</rss>"
    return xml
def trim_cmt(cmt_list):
    import html.parser as HTMLParser
    html_parser = HTMLParser.HTMLParser()
    pattern = re.compile("<.*>.*?<.*>", re.S)
    return map(
        lambda x: re.sub(pattern, "", html_parser.unescape(x[0])).strip(),
        cmt_list)
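# Usage sketch for trim_cmt, under the assumption that each element of
# cmt_list is a tuple whose first item is the raw comment HTML (the sample
# data below is hypothetical). map() is lazy on Python 3, so the result is
# wrapped in list(); as above, HTMLParser().unescape only exists on Python
# versions before 3.9.
sample_comments = [
    ("&quot;Nice post&quot; <div>ignored markup</div>",),
    ("A &amp; B",),
]
print(list(trim_cmt(sample_comments)))
# expected with these inputs: ['"Nice post"', 'A & B']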
def translate_text(text, lang):
    """
    Takes in input text and utilizes Google's Translate API to convert it to
    the selected language.
    """
    translate_client = translate.Client()

    # Takes the input language and returns the string abbreviation for the
    # translate file. This resolves the Google API invalid input error when
    # passing the lang parameter.
    langDict = {
        'en': 'English',
        'es': 'Spanish',
        'it': 'Italian',
        'fr': 'French',
        'tr': 'Turkish',
        'ko': 'Korean'
    }
    language = ''
    for abbrev, fullLanguage in langDict.items():
        if fullLanguage == langDict[lang]:
            language = abbrev

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Initialize parser to convert special characters back to their original
    # form after Google translation API has finished translating the text.
    parser = htmlparser.HTMLParser()
    result = translate_client.translate(text, target_language=language)
    return parser.unescape(result['translatedText'])
def find_movie(content, title, year):
    found_urls = {}
    found_movies = []
    h = HTMLParser.HTMLParser()
    for secmatches in re.finditer(search_section_pattern, content,
                                  re.IGNORECASE | re.DOTALL):
        log(__name__, secmatches.group('section'))
        for matches in re.finditer(movie_season_pattern,
                                   secmatches.group('content'),
                                   re.IGNORECASE | re.DOTALL):
            if matches.group('link') in found_urls:
                if secmatches.group('section') == 'close':
                    found_movies[found_urls[matches.group('link')]]['is_close'] = True
                if secmatches.group('section') == 'exact':
                    found_movies[found_urls[matches.group('link')]]['is_exact'] = True
                continue
            found_urls[matches.group('link')] = len(found_movies)
            found_title = matches.group('title')
            found_title = h.unescape(found_title)
            log(__name__, "Found movie on search page: %s (%s)" % (
                found_title, matches.group('year')))
            found_movies.append(
                {'t': found_title.lower(),
                 'y': int(matches.group('year')),
                 'is_exact': secmatches.group('section') == 'exact',
                 'is_close': secmatches.group('section') == 'close',
                 'l': matches.group('link'),
                 'c': int(matches.group('numsubtitles'))})

    year = int(year)
    title = title.lower()

    # Priority 1: matching title and year
    for movie in found_movies:
        if movie['t'].find(title) > -1:
            if movie['y'] == year:
                log(__name__, "Matching movie found on search page: %s (%s)" % (
                    movie['t'], movie['y']))
                return movie['l']

    # Priority 2: matching title and one off year
    for movie in found_movies:
        if movie['t'].find(title) > -1:
            if movie['y'] == year + 1 or movie['y'] == year - 1:
                log(__name__,
                    "Matching movie found on search page (one off year): %s (%s)" % (
                        movie['t'], movie['y']))
                return movie['l']

    # Priority 3: "Exact" match according to search result page
    close_movies = []
    for movie in found_movies:
        if movie['is_exact']:
            log(__name__, "Using 'Exact' match: %s (%s)" % (movie['t'], movie['y']))
            return movie['l']
        if movie['is_close']:
            close_movies.append(movie)

    # Priority 4: "Close" match according to search result page
    if len(close_movies) > 0:
        close_movies = sorted(close_movies, key=itemgetter('c'), reverse=True)
        log(__name__, "Using 'Close' match: %s (%s)" % (
            close_movies[0]['t'], close_movies[0]['y']))
        return close_movies[0]['l']

    return None
def replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    if version_info[0] == 3:
        txt = HTMLParser.unescape(txt)
    else:
        txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", '"')
    txt = txt.replace("&amp;", "&")
    return txt
def title_echo(r, line, bot, chan):
    """Echo the title of a url via MC"""

    def write_url(title, url):
        """Write a URL to the database"""
        conn = sqlite3.connect(DB_FILE)
        c = conn.cursor()
        t = (title, url)
        c.execute('INSERT INTO url_history VALUES (?, ?)', t)
        conn.commit()
        conn.close()

    # get url and build opener with custom user-agent and cookies enabled
    url = r.group(5).split()[0]
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie_jar))
    opener.addheaders = [('User-agent', USER_AGENT)]

    # open the url but only read a maximum of 2**20 bytes in case someone is
    # screwing with us
    try:
        f = opener.open(str(url))
        html = f.read(1048576)  # only read the first 2**20 bytes
    except:
        e = traceback.format_exc()
        debug(e, log_only=True)
        write_url(None, url)
        return

    # uncompress if the data is gzipped
    try:
        encoding = f.info()['content-encoding']
    except KeyError:
        encoding = None
    if encoding and encoding == 'gzip':
        html = io.BytesIO(html)
        gz = gzip.GzipFile(fileobj=html, mode='rb')
        html = gz.read()
        gz.close()
    f.close()

    # decode the html and search for the title element
    html = html.decode('utf-8', errors='replace')
    title = re.search(r'<title.*?>(.*?)</title>', html,
                      re.DOTALL | re.IGNORECASE)
    if title:
        title = title.group(1).strip()
        title = title.replace('\n', '').replace('\r', '')
        title = ' '.join([w for w in title.split(' ') if w != ''])
        title = htmlp.HTMLParser().unescape(title)
    else:
        write_url(None, url)
        return

    # if we are here then there's a title so echo it to the channel
    bot.write('PRIVMSG {chan} :Title: {msg}\r\n'.format(chan=chan, msg=title))
    write_url(title, url)
def create_block_info(input_path, preprocessor):
    """Create block info."""
    results = []
    html_parser = parser.HTMLParser()
    with tf.io.gfile.GFile(input_path) as input_file:
        for line in input_file:
            results.extend(
                wiki_preprocessor.example_from_json_line(
                    line, html_parser, preprocessor))
    return results
def fix_scripts(dom):
    # ldjson workaround
    p = HTMLParser.HTMLParser()
    for script in dom.getElementsByTagName("script"):
        r = RawText()
        r.ownerDocument = dom
        r.data = p.unescape(script.childNodes[0].wholeText)
        for cn in script.childNodes:
            script.removeChild(cn)
        script.appendChild(r)
def speeches():
    f = open('speeches.txt', 'r')
    text_model = markovify.Text(f.read())
    tweet = text_model.make_short_sentence(100) + ' ' + random.choice(hashtags)
    print('Posting to Twitter...')
    try:
        api.update_status(HTMLParser.HTMLParser().unescape(tweet))
    except ImportError:
        api.update_status(htmlparser.HTMLParser().unescape(tweet))
    f.close()
    print('Tweet you posted: ' + tweet)
def get_info():
    '''
    This is a generator that takes titles and descriptions, reformats the html
    and yields the title and description of the latest videos for Alexa to
    respond with
    '''
    for tit, des in zip(titles, descriptions):
        titled = tit.a.img["alt"].strip()
        title = parser.HTMLParser().unescape(titled)
        description = des.text.strip()
        yield f"{title}. {description}"
def decode_value(data):
    new_data = {}
    for k, v in data.items():
        val = bytes(v.replace('%', '=').replace("+", " "), 'UTF-8')
        val_decode_str = decodestring(val).decode('UTF-8')
        val_decode_str = decodestring(val_decode_str).decode('UTF-8')
        if "&#" in val_decode_str:
            parser = hlmtparser.HTMLParser()
            val_decode_str = parser.unescape(val_decode_str)
        new_data[k] = val_decode_str
    return new_data
def handle(self, *args, **options):
    from tendenci.apps.pages.models import Page

    pages = Page.objects.all()
    self.h = html_parser.HTMLParser()
    pattern = re.compile(r'(&#\d+;)', re.IGNORECASE)
    for page in pages:
        page.title = re.sub(pattern, self.unescape, page.title)
        page.content = re.sub(pattern, self.unescape, page.content)
        page.save()
def update_text(clickData):
    if clickData['points'][0]['curveNumber'] == 0:
        return html.Div([
            html.A(
                "Direct link to Reddit user comment",
                href="https://new.reddit.com/comments/"
                + str(reddit_xamarin_link_Id_Data[
                    clickData['points'][0]['pointIndex']]).replace("t3_", "")
                + "/_/"
                + str(reddit_xamarin_Id_Data[
                    clickData['points'][0]['pointIndex']]),
                target="_blank"),
            html.H3(HP.HTMLParser().unescape(reddit_xamarin_Body_Data[
                clickData['points'][0]['pointIndex']]))
        ])
    if clickData['points'][0]['curveNumber'] == 1:
        return html.Div([
            html.A(
                "Direct link to Reddit user comment",
                href="https://new.reddit.com/comments/"
                + str(reddit_react_native_link_Id_Data[
                    clickData['points'][0]['pointIndex']]).replace("t3_", "")
                + "/_/"
                + str(reddit_react_native_Id_Data[
                    clickData['points'][0]['pointIndex']]),
                target="_blank"),
            html.H3(HP.HTMLParser().unescape(reddit_react_native_Body_Data[
                clickData['points'][0]['pointIndex']]))
        ])
    if clickData['points'][0]['curveNumber'] == 2:
        return html.Div([
            html.A(
                "Direct link to Reddit user comment",
                href="https://new.reddit.com/comments/"
                + str(reddit_flutter_link_Id_Data[
                    clickData['points'][0]['pointIndex']]).replace("t3_", "")
                + "/_/"
                + str(reddit_flutter_Id_Data[
                    clickData['points'][0]['pointIndex']]),
                target="_blank"),
            html.H3(HP.HTMLParser().unescape(reddit_flutter_Body_Data[
                clickData['points'][0]['pointIndex']]))
        ])
def decode_html_entities(s):
    """
    Replaces html entities with the character they represent.

    >>> print(decode_html_entities("&lt;3 &amp;"))
    <3 &
    """
    parser = HTMLParser.HTMLParser()

    def unesc(m):
        return parser.unescape(m.group())

    return re.sub(r'(&[^;]+;)', unesc, ensure_unicode(s))
def load_result(self, runtime):
    values = {
        'problem_id': str(runtime.pid),
        'user_id': self.username,
        'language': '-1',
        'jresult': '-1'
    }
    url = self.result_url.format(urllib.parse.urlencode(values))
    result_regex = re.compile(
        self.result_regex_r.format(pid=runtime.pid, name=self.username))
    while True:
        self.set_status('Waiting for judging...')
        time.sleep(1)
        html, resp = self.get(url, self.headers)
        match = result_regex.search(html)
        result = match.group(4)
        if result not in [
                'Pending', 'Pending_Rejudging', 'Compiling', 'Running_&_Judging'
        ]:
            break
    match = re.search(
        self.detail_regex_r.format(pid=runtime.pid, name=self.username), html)
    message = 'Loading result...'
    self.set_status(message)
    result = result.replace('_', ' ')
    if result.endswith('Exceed'):
        result = result + 'ed'
    judge_id = match.group(2)
    memory = match.group(6) + ' KB'
    time_ = match.group(7) + ' ms'
    detail = [result, time_, memory]
    if result == 'Compile Error':
        html, resp = self.get(self.compile_message_url.format(judge_id),
                              self.headers)
        runtime.judge_compile_message = \
            parser.HTMLParser().unescape(
                self.compile_message_regex.findall(html)[0]).replace('\r', '')
    runtime.judge_detail = [detail]
    runtime.judge_result = result
    runtime.judge_score = 100 if result == 'Accepted' else 0
def download_subtitles(self, suburls):
    """Download the SAMI subtitles, decode the HTML entities and save to the
    temp directory. Return a list of the paths to the downloaded subtitles."""
    paths = []
    for sub_data in suburls:
        sami = self.make_request(url=sub_data['href'], method='get').decode(
            'utf-8', 'ignore').strip()
        htmlparser = HTMLParser.HTMLParser()
        subtitle = htmlparser.unescape(sami).encode('utf-8')
        path = os.path.join(self.tempdir,
                            '{0}.sami'.format(sub_data['languageCode']))
        with open(path, 'wb') as subfile:
            subfile.write(subtitle)
        paths.append(path)
    return paths
def _render_cell(self, column, cell, model, iter_, destroy):
    article = model[iter_][0]
    title = escape(
        re.sub('\\s+', ' ', article.title.replace('\n', ' ').strip()))
    content = []
    html_parser = parser.HTMLParser()
    html_parser.handle_data = content.append
    html_parser.feed(article.description)
    content = escape(
        re.sub('\\s+', ' ',
               ''.join(content)[:1000].replace('\n', ' ').strip()))
    cell.set_property(
        'markup',
        '<big>%s</big>\n<small>%s</small>' % (
            ('%s' if article.read else '<b>%s</b>') % title, content))