Exemple #1
0
def parse_html(d):
    """Build a word -> {document key -> count} index for one folder of pages.

    Looks for files named ``0`` .. ``499`` under ``WEBPAGES_RAW/<d>/``; files
    that do not exist are skipped.  For every file found, the markup is
    parsed with BeautifulSoup, comments and ``<script>``/``<style>`` elements
    are removed, and the remaining text is HTML-unescaped, reduced to ASCII,
    lower-cased and split on runs of non-word characters and underscores.
    Tokens of length 1 are ignored.

    Args:
        d: Folder identifier.  It is converted with ``str()`` and used both
            in the file path and in the document key ``"<d>/<f>"``.

    Returns:
        A ``defaultdict`` mapping each word to a ``defaultdict(int)`` that
        maps document key to the number of occurrences in that document.
        Empty when none of the candidate files exist.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8 (it is decoded with
            the default strict error handling).
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() performs
    # the same conversion.  Fall back to the legacy method on interpreters
    # whose ``html`` module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    # Compiled once for all files.  Raw string so that ``\W`` reaches the
    # regex engine instead of being read as a (invalid) string escape.
    separators = re.compile(r'[\W_]+')

    folder_dict = defaultdict(dict)
    for f in range(0, 500):
        key = str(d) + "/" + str(f)
        path = "WEBPAGES_RAW/" + key
        if not os.path.isfile(path):
            continue

        with codecs.open(path, "r", encoding="utf-8") as handle:
            # Drop every non-ASCII character before parsing.
            markup = handle.read().encode("ascii", "ignore")

        soup = BeautifulSoup(markup)

        # Remove nodes whose text must not be indexed.
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for script in soup("script"):
            script.extract()
        for style in soup("style"):
            style.extract()

        content = " ".join(item.strip() for item in soup.findAll(text=True))
        content = unescape(content)
        # Unescaping can reintroduce non-ASCII characters (e.g. ``&eacute;``);
        # strip them again and go back to a text string.
        content = content.encode("ascii", "ignore").decode('utf-8')

        for word in separators.sub(' ', content).lower().split():
            if len(word) > 1:
                # The outer default factory is ``dict``, so the per-word
                # counter has to be installed explicitly on first sight.
                if word not in folder_dict:
                    folder_dict[word] = defaultdict(int)
                folder_dict[word][key] += 1
    return folder_dict
Exemple #2
0
def programs():
    """List the programme headings scraped from the RTP Play index page.

    Downloads ``http://www.rtp.pt/play/programas`` (decoded as latin-1),
    extracts the text of every ``<div class="meta-data"><h4>...</h4>``
    heading and adds one folder item per heading.  Each item is routed to
    ``programs_category`` with the heading text as ``name``, the heading's
    zero-based position on the page as ``id`` and ``page=1``.  The directory
    is then closed with ``endOfDirectory``.

    If the download fails, ``raise_notification()`` is called and nothing is
    added to the directory.
    """
    try:
        response = requests.get("http://www.rtp.pt/play/programas", headers=HEADERS)
        response.encoding = "latin-1"
        page = response.text
    except Exception:
        # ``Exception`` rather than a bare ``except:`` so that SystemExit and
        # KeyboardInterrupt are not mistaken for a failed download.
        raise_notification()
        # raise_notification() may abort on its own; if it returns there is
        # no page text to parse, so stop here instead of failing below.
        return

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() performs
    # the same conversion.  Fall back to the legacy method on interpreters
    # whose ``html`` module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    headings = re.compile(r'<div class="meta-data"><h4>(.+?)</h4>').findall(page)

    for i, name in enumerate(headings):
        name = unescape(kodiutils.compat_py23str(name))
        name = name.encode('utf8', 'replace')
        liz = ListItem(name)
        addDirectoryItem(handle=plugin.handle,
                         listitem=liz,
                         isFolder=True,
                         url=plugin.url_for(programs_category,
                                            name=name,
                                            id=i,
                                            page=1))

    endOfDirectory(plugin.handle)
Exemple #3
0
def getTweetText(line):
    """Return the HTML-unescaped, ASCII-only text of one JSON-encoded tweet.

    When the tweet's ``text`` starts with ``"RT"`` and the tweet carries a
    ``retweeted_status`` object, the text of that retweeted status is used
    instead of the tweet's own ``text``.

    Args:
        line: JSON document describing a single tweet.  It must contain a
            ``text`` field.

    Returns:
        The selected text with HTML entities decoded and every non-ASCII
        character removed.

    Raises:
        ValueError: If ``line`` is not valid JSON.
        KeyError: If the required ``text`` field is missing.
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() performs
    # the same conversion.  Fall back to the legacy method on interpreters
    # whose ``html`` module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    tweet = json.loads(line)
    if tweet['text'].startswith("RT") and 'retweeted_status' in tweet:
        text = tweet['retweeted_status']['text']
    else:
        text = tweet['text']

    text = unescape(text)
    # Encoding with "ignore" drops non-ASCII characters; decoding turns the
    # resulting bytes back into a text string.
    return text.encode('ascii', 'ignore').decode('utf8', 'ignore')