def parse_html(d, max_files=500):
    """Tokenize the raw HTML files stored under WEBPAGES_RAW/<d>/<f>.

    Scans files ``WEBPAGES_RAW/<d>/0`` .. ``WEBPAGES_RAW/<d>/<max_files-1>``
    (missing files are skipped), strips HTML comments, <script> and <style>
    blocks, and counts the remaining visible words.

    Args:
        d: folder identifier (first path component under WEBPAGES_RAW).
        max_files: number of file slots to probe per folder (default 500,
            matching the original hard-coded limit).

    Returns:
        Mapping ``word -> {"<d>/<f>": count}`` for every word longer than
        one character.
    """
    # Nested defaultdict removes the manual "if word not in ..." init dance.
    folder_dict = defaultdict(lambda: defaultdict(int))
    # Raw string (the old '[\W_]+' relied on an invalid escape) and compiled
    # once, hoisted out of the per-file loop.
    word_pattern = re.compile(r'[\W_]+')
    for f in range(max_files):
        key = str(d) + "/" + str(f)
        path = "WEBPAGES_RAW/" + key  # renamed: `file` shadows the builtin
        if not os.path.isfile(path):
            continue
        with codecs.open(path, "r", encoding="utf-8") as handle:
            raw = handle.read().encode("ascii", "ignore")
        soup = BeautifulSoup(raw)
        # Drop comments, scripts and styles so only visible text is indexed.
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for script in soup("script"):
            script.extract()
        for style in soup("style"):
            style.extract()
        content = " ".join(item.strip() for item in soup.findAll(text=True))
        content = HTMLParser().unescape(content)
        # Round-trip through ASCII to discard any remaining non-ASCII chars.
        content = content.encode("ascii", "ignore").decode("utf-8")
        for word in word_pattern.sub(' ', content).lower().split():
            if len(word) > 1:
                folder_dict[word][key] += 1
    return folder_dict
def programs():
    """List all RTP Play programmes as Kodi directory items.

    Fetches the programmes index page, extracts each programme name from
    the meta-data headers, and adds one folder item per programme pointing
    at ``programs_category`` (page 1). Shows a notification and stops on a
    network failure.
    """
    try:
        req = requests.get("http://www.rtp.pt/play/programas", headers=HEADERS)
        req.encoding = "latin-1"
        page = req.text
    # Narrowed from a bare `except:` — only network/HTTP errors are expected
    # here; a bare clause would also swallow KeyboardInterrupt/SystemExit.
    except requests.RequestException:
        raise_notification()
        # Guard: the original fell through and hit an unbound `req` below
        # whenever raise_notification() did not itself raise.
        return
    match = re.compile(r'<div class="meta-data"><h4>(.+?)</h4>').findall(page)
    # enumerate() replaces the manual `i = i + 1` counter.
    for i, name in enumerate(match):
        name = HTMLParser().unescape(kodiutils.compat_py23str(name))
        name = name.encode('utf8', 'replace')
        liz = ListItem(name)
        addDirectoryItem(handle=plugin.handle, listitem=liz, isFolder=True,
                         url=plugin.url_for(programs_category, name=name,
                                            id=i, page=1))
    endOfDirectory(plugin.handle)
def getTweetText(line):
    """Extract the display text from one JSON-encoded tweet line.

    Args:
        line: a single JSON string as emitted by the Twitter streaming API.

    Returns:
        The tweet text (the full retweeted text when the tweet is a
        truncated "RT ..." wrapper), HTML-unescaped and with all non-ASCII
        characters stripped.

    Raises:
        KeyError / json.JSONDecodeError on malformed input, as before.
    """
    # Local import keeps this fix self-contained for the file.
    import html
    tweet = json.loads(line)
    # Prefer the untruncated original: 'text' on a retweet starts with "RT "
    # and is cut off at 140 chars.
    if tweet['text'].startswith("RT") and 'retweeted_status' in tweet:
        text = tweet['retweeted_status']['text']
    else:
        text = tweet['text']
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # 3.4 and removed in Python 3.9 (it delegated to html.unescape anyway).
    text = html.unescape(text)
    # ASCII round-trip drops emoji and other non-ASCII characters; the bytes
    # are pure ASCII, so decoding as ASCII is equivalent to the old
    # decode('utf8', 'ignore').
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text