# Requires (module level): urllib, urllib2, xml.dom.minidom,
# BeautifulSoup 3, and the project's own `tools` helpers
# (remove_html_tags, decode_htmlentities, shorten_url).
def google_news(self, e):
    query = urllib.quote(e.input)
    if not query:
        # No query: fall back to the top US headlines feed.
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query
    dom = xml.dom.minidom.parse(urllib2.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(
        newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    # Strip anchors and the grey source/date text from the snippet.
    for link in description.findAll('a'):
        link.extract()
    for link in description.findAll(color='#6f6f6f'):
        link.extract()
    description = str(description).strip().decode("utf-8", 'ignore')
    description = tools.remove_html_tags(description)
    description = tools.decode_htmlentities(description)
    # Trim the fixed 9-character tail the feed leaves behind, then cut
    # at the last full sentence.
    description = description[:len(description) - 9]
    if description.rfind(".") != -1:
        description = description[:description.rfind(".") + 1]
    link = tools.shorten_url(
        newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    e.output = "%s - %s [ %s ]" % (title.encode("utf-8", 'ignore'),
                                   description.encode("utf-8", 'ignore'),
                                   link.encode("utf-8", 'ignore'))
    return e
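# A minimal usage sketch for google_news, assuming the event object only
# needs .input and .output; `Event` here is a hypothetical stand-in for
# the bot framework's real event class.
class Event(object):
    def __init__(self, input):
        self.input = input
        self.output = None

# e = plugin.google_news(Event("python"))  # `plugin` is the hosting class instance
# print e.output  # -> "title - trimmed description [ shortened link ]"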
import urllib2

from BeautifulSoup import BeautifulSoup


def crawlPost(link_id):
    exceptions = ["ipTRACKERonline.com"]
    core_url = "http://forum.419eater.com/forum/"
    page_name = "viewtopic.php"
    forum_args = "?t="
    response = None
    try:
        post = urllib2.urlopen(core_url + page_name + forum_args + link_id)
        soup = BeautifulSoup(post,
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        try:  # hating BeautifulSoup
            response = soup.findAll("td", {"class": "postbody"})[1]
            response = response.renderContents()
            for elem in exceptions:
                if elem in response:
                    return None
            # Keep paragraph breaks (double <br />) but collapse single
            # line breaks, using <magic> as a temporary placeholder.
            response = response.replace("<br />\n<br />\n", "<magic>\n")
            response = response.replace("<br />\n", "")
            response = response.replace("<magic>\n", "<br />\n")
            response = BeautifulSoup(response)
            response = "".join(response.findAll(text=True))
            response = response.encode("ascii", "ignore")
            response = response.replace("\r\n ", "\r\n")
            response = response.replace("\n\n Quote: ", "")
        except Exception:
            return None
    except Exception:
        print "Failing on:", core_url + page_name + forum_args + link_id
    return response  # WARNING: Unicode
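# Usage sketch for crawlPost, with a made-up thread id: the argument is
# the `t` query parameter of a forum.419eater.com viewtopic.php URL.
# body = crawlPost("12345")
# if body is not None:
#     print body  # post text with <br /> marking paragraph breaks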
from BeautifulSoup import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize


def get_sent_tokens(text):
    # Strip markup and non-ASCII characters, then tokenize per sentence.
    text = BeautifulSoup(text.encode('utf-8').decode('ascii', 'ignore')).text
    return [get_tokens(sent) for sent in sent_tokenize(str(text))]
def parse_petharbor_table(content, category=None, batch_id=None):
    # Clean up the HTML, then re-parse it through html5lib's sanitizer
    # into a BeautifulSoup tree.
    bs = BS(content)
    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("beautifulsoup"))
    tree = parser.parse(bs.encode())
    results = tree.findAll("table", attrs={'class': 'ResultsTable'})[0]
    ret = []
    for row in results.findAll('tr')[1:]:  # skip the header row
        columns = row.findAll('td')
        cells = map(lambda x: x.encodeContents(), columns)
        # Split "Name (ID)" into name and code
        name, code = parse_cond_parens(cells[1], reverse=True)
        # Split gender and spayed/neutered status
        gender, spayed_or_neutered = parse_cond_parens(cells[2])
        ret.append({
            'petharbor_url': "http://petharbor.com/%s" % columns[0].find('a')['href'],
            'petharbor_img_thumb_url': columns[0].find('img')['src'],
            'name': capitalize_first(name) or code,
            'code': code,
            'gender': gender,
            'spayed_or_neutered': spayed_or_neutered,
            'main_color': cells[3],
            'breed': cells[4],
            'age': parse_petharbor_age(cells[5]),
            'brought_to_shelter': cells[6],
            'located_at': cells[7],
            'category': category,
            'batch_id': batch_id,
            'last_checked': datetime.datetime.now().strftime('%Y%m%d%H%M%S'),
        })
    return ret
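# Usage sketch for parse_petharbor_table: `content` is the HTML of a
# PetHarbor search-results page (fetched elsewhere); the category and
# batch_id values below are invented for illustration.
# animals = parse_petharbor_table(html, category='dogs', batch_id=42)
# print animals[0]['name'], animals[0]['breed'], animals[0]['petharbor_url']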
def unescape_xhtml(s):
    htmlInput = '<html>' + s
    # The LiveJournal stream contains &apos;, so we must use
    # XHTML_ENTITIES (HTML_ENTITIES does not cover it).
    unescaped = BeautifulSoup(
        htmlInput,
        convertEntities=BeautifulSoup.XHTML_ENTITIES).contents[0].string
    if not unescaped:
        unescaped = u""
    # Convert the BeautifulSoup NavigableString into a real str.
    return unescaped.encode("utf-8")
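# Quick check for unescape_xhtml; &apos; is the XHTML entity that
# motivates XHTML_ENTITIES above.
# print unescape_xhtml("it&apos;s &amp; that&apos;s")  # -> "it's & that's"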
def getVideos(self):  # blog --> "someblog.tumblr.com"
    # Assumes module-level: `client` (Tumblr API client), `nltk`, and the
    # project's `video` / `text_information` classes.
    posts = client.posts(self.b_name + ".tumblr.com",
                         limit=20, type='video')['posts']
    i = 0
    while i < 20:
        # `blood` is a stand-in for the commented-out duplicate check below.
        blood = True
        if blood:
            # #check if video is already in database
            # cursor.execute('''SELECT * FROM videos WHERE player=%s''',
            #                posts[i]['player'][0]['embed_code'])
            # rows = cursor.fetchall()
            # #this database insertion should be moved to the end of
            # #recursiveReblogger
            # if not any(posts[i]['player'][0]['embed_code'] in row
            #            for row in rows):
            #     cursor.execute('''INSERT into videos(url, player, slug)
            #                    VALUES("url placeholder", %s, %s)''',
            #                    (posts[i]['player'][0]['embed_code'],
            #                     posts[i]['slug']))
            #     con.commit()
            # Create the video object.
            embed_code = posts[i]['player'][0]['embed_code']
            post_url = posts[i]['post_url']
            da_format = posts[i]['format']
            # For the slug: replace hyphens with spaces, NLTK-tokenize.
            slug = nltk.word_tokenize(posts[i]['slug'].encode(
                'ascii', errors='ignore').replace('-', ' '))
            # For the caption: use BeautifulSoup to get ONLY the text of
            # the last <p> tag, then tokenize.
            a = posts[i]['caption'].encode('ascii', errors='ignore')
            caption = []
            if a != "":
                b = BeautifulSoup(a)('p')[-1].extract().string
                if b is not None:
                    caption = caption + nltk.word_tokenize(
                        b.encode('ascii', errors='ignore'))
            tags = posts[i]['tags']
            tag_holder = [str(t) for t in tags]
            # Store the video objects in the unvisited_posts list.
            self.unvisited_posts.insert(
                0, video(embed_code, post_url, da_format,
                         text_information(slug, caption, tag_holder)))
        else:
            print(str(i) + ": this video didn't work")
        i = i + 1
def search(self, keywords):
    url = "http://skreemr.com/results.jsp?q=%s&l=10" % urllib2.quote(
        " ".join(keywords))
    info_re = re.compile(
        r"<td>mp3\s+-\s+(?P<rate>.+)\s+-\s+(?P<duration>.+)\s+-\s+(?P<size>.+) <",
        re.IGNORECASE)
    # Fetch and parse the results page once; bail out if the expected
    # results table is missing.
    tables = BeautifulSoup(get_content(url)).findAll('table')
    if len(tables) < 3:
        return
    for t in tables[2].findAll('table'):
        if t.object:
            r = Result()
            r.name = t.h2.a.string
            # The mp3 URL hides in the Flash player's FlashVars parameter.
            r.url = urllib2.unquote(
                t.object.find('param', {'name': 'FlashVars'})['value']
                .split('soundFile=')[1])
            # Fill in rate / duration / size from the info row.
            r.__dict__.update(info_re.search(t.encode('utf8')).groupdict())
            yield r
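# Usage sketch for search: it is a generator, so results stream in as
# they are parsed; the attribute names come from Result plus the regex
# groups above.
# for r in searcher.search(["miles", "davis"]):
#     print r.name, r.duration, r.url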
print "Crawling Pages, please wait..." with tqdm(total=retrieveLimit) as progress: for page in urlList: if docIDCounter > retrieveLimit: break #quits crawling if retrieval limit is reached try: #---------- Page Crawler (gets words and links from each page --------- soup = "" browse.open(page) if page.endswith(".txt"): soup = browse.response().read() else: soup = BeautifulSoup(browse.response().read( )) #if can't parse, assumed to be binary file or 404 soup = soup.getText() hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest() if hashTest not in duplicateDetect: duplicateDetect.append(hashTest) wordsInPage = soup.split() if not page.endswith(".txt"): for link in browse.links(): tempURL = urlparse.urljoin(link.base_url, link.url) #BELOW: gets rid of duplicate urls resulting from index.html/index.htm if tempURL.endswith("index.html"): tempURL = tempURL.replace("index.html", "") elif tempURL.endswith("index.htm"): tempURL = tempURL.replace("index.htm", "") if tempURL not in urlList: if tempURL.startswith(baseUrl):
def get_tokens(text):
    # Shares imports with get_sent_tokens above (BeautifulSoup,
    # nltk.tokenize.word_tokenize).
    text = BeautifulSoup(text.encode('utf-8').decode('ascii', 'ignore')).text
    return word_tokenize(str(text).lower())
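# Usage sketch for get_tokens / get_sent_tokens above (NLTK's 'punkt'
# model must be downloaded for the tokenizers to work):
# >>> get_tokens(u"<p>Hello, world!</p>")
# ['hello', ',', 'world', '!']
# >>> get_sent_tokens(u"First sentence. Second one.")
# [['first', 'sentence', '.'], ['second', 'one', '.']]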
def prep_thread(self, thread):
    # Assumes module-level: datetime, relativedelta (dateutil),
    # HTMLParser, BeautifulSoup.
    # board, url, no
    thread['board'] = self.board
    thread['url'] = 'http://boards.4chan.org/{0}/thread/{1}'.format(
        self.board, thread['no'])
    thread['no'] = str(thread['no'])
    # text: prefer the comment, then the subject, then the poster name.
    thread['text'] = 'No text available'
    if 'name' in thread:
        thread['text'] = thread['name']
    if 'sub' in thread:
        thread['text'] = thread['sub']
    if 'com' in thread:
        thread['text'] = thread['com']
    s = BeautifulSoup(thread['text']).getText()
    s = HTMLParser().unescape(s)
    thread['text'] = s.encode('utf8')
    # age_s: thread age in seconds (total_seconds, so threads older than
    # a day are not truncated the way timedelta.seconds would).
    thread['age_s'] = int((datetime.now() -
                           datetime.fromtimestamp(thread['time'])).total_seconds())
    # age_hm: human-readable age, "H:MMh" or "Mmin".
    age = relativedelta(datetime.now(),
                        datetime.fromtimestamp(thread['time']))
    if age.hours:
        thread['age_hm'] = '{0}:{1:02d}h'.format(age.hours, age.minutes)
    else:
        thread['age_hm'] = '{0}min'.format(age.minutes)
    # rpm: replies per minute, rounded to one decimal.
    if thread['age_s'] != 0:
        thread['rpm'] = float("%.1f" % (float(thread['replies']) * 60 /
                                        float(thread['age_s'])))
    else:
        thread['rpm'] = 0
    if 'filename' in thread and thread['ext'] in ['.jpg', '.png']:
        thread['imgurl'] = 'https://i.4cdn.org/{0}/{1}{2}'.format(
            thread['board'], thread['tim'], thread['ext'])
    thread['formatted'] = (
        '*{0}/min ({1}r in {2})*\n{3}\n\n{4}\n\n(from {5})\n{6}'.format(
            thread['rpm'], thread['replies'], thread['age_hm'],
            thread.get('imgurl', ''), thread['text'],
            thread['country_name'], thread['url']))
    return thread
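# Hedged sketch of the input prep_thread expects: a thread dict from
# 4chan's JSON API. The values below are invented; only keys the method
# reads are shown.
# thread = {'no': 570368, 'time': 1344402680, 'replies': 13,
#           'sub': 'DPT', 'com': 'Daily programming thread',
#           'country_name': 'United States',
#           'filename': 'example', 'ext': '.png', 'tim': 1344402680740}
# print monitor.prep_thread(thread)['formatted']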
# fertilization info (assignment reconstructed from the field label below)
bullet_str = element['fertillization_info']
bullet_list = bullet_str.split('</li>\n<li>')
for pre_bullet in bullet_list:
    bullet = BeautifulSoup(pre_bullet).text
    writer.writerow([element['id'], 'fertillization_info', bullet])

# product checklist info
bullet_str = element['product_checklist_info']
bullet_list = bullet_str.split('</span></li> <li>')
for pre_bullet in bullet_list:
    bullet = BeautifulSoup(pre_bullet).text
    writer.writerow([element['id'], 'product_checklist_info', bullet])

# attributes (UTF-8 encode: these values may contain non-ASCII text)
bullet_str = element['attributes']
bullet_list = bullet_str.split('</li><li>')
for pre_bullet in bullet_list:
    bullet = BeautifulSoup(pre_bullet).text
    writer.writerow([element['id'], 'attributes', bullet.encode('utf8')])

# pruning_info
bullet_str = element['pruning_info']
bullet_list = bullet_str.split('</li>\n<li>')
for pre_bullet in bullet_list:
    bullet = BeautifulSoup(pre_bullet).text
    writer.writerow([element['id'], 'pruning_info', bullet])

# other single-valued fields
writer.writerow([element['id'], 'growth_rate', element['growth_rate']])
writer.writerow([element['id'], 'light', element['light']])
writer.writerow([element['id'], 'water_wise', element['water_wise']])
writer.writerow([element['id'], 'spacing', element['spacing']])
writer.writerow([element['id'], 'deer_resistant', element['deer_resistant']])
writer.writerow([element['id'], 'botanical_name',
                 element['botanical_name'].encode('utf8')])
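# Shape of the output produced above, assuming `writer` is a csv.writer
# and `element` is one scraped plant record: one (id, field, value) row
# per bullet or attribute, e.g.
# ['1234', 'pruning_info', 'Prune in early spring.']
# ['1234', 'growth_rate', 'Moderate']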
print "Crawling Pages, please wait..." with tqdm(total=retrieveLimit) as progress: for page in urlList: if docIDCounter > retrieveLimit: break #quits crawling if retrieval limit is reached try: #---------- Page Crawler (gets words and links from each page --------- soup = "" browse.open(page) if page.endswith(".txt"): soup = browse.response().read() else: soup = BeautifulSoup(browse.response().read()) #if can't parse, assumed to be binary file or 404 soup = soup.getText() hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest() if hashTest not in duplicateDetect: duplicateDetect.append(hashTest) wordsInPage = soup.split() if not page.endswith(".txt"): for link in browse.links(): tempURL = urlparse.urljoin(link.base_url, link.url) #BELOW: gets rid of duplicate urls resulting from index.html/index.htm if tempURL.endswith("index.html"): tempURL = tempURL.replace("index.html", "") elif tempURL.endswith("index.htm"): tempURL = tempURL.replace("index.htm", "") if tempURL not in urlList:
def __init__(self, params):
    import re
    from addon import Addon
    from addondict import AddonDict as XBMCDict
    from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment

    a = Addon()
    site = self.__module__
    mode = params['mode']

    home_url = 'http://playporn.to/'
    search_url = home_url + '?submit=Search&s='
    movies_url = home_url + 'category/xxx-movie-stream/'
    scenes_url = home_url + 'category/xxx-clips-scenes-stream/'
    false_positives = ['http://playporn.to/deutsche-milfs-anonym-sex/']

    if mode == 'main':
        item_list = [
            {'site': site, 'mode': 'list', 'title': a.language(30003),
             'content': '', 'url': home_url,
             'cover_url': a.image('recent.png', image),
             'backdrop_url': a.art(), 'type': 3},
            {'site': site, 'mode': 'sub', 'title': a.language(30001),
             'content': '', 'url': movies_url,
             'cover_url': a.image('movies.png', image),
             'backdrop_url': a.art(), 'type': 3},
            {'site': site, 'mode': 'sub', 'title': a.language(30002),
             'content': '', 'url': scenes_url,
             'cover_url': a.image('scenes.png', image),
             'backdrop_url': a.art(), 'type': 3},
            {'site': site, 'mode': 'list', 'title': a.language(30004),
             'content': 'search', 'url': search_url,
             'cover_url': a.image('search.png', image),
             'backdrop_url': a.art(), 'type': 3}]
        item_list.extend(a.favs_hist_menu(site))
        item_list.extend(a.extended_menu())
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'sub':
        item_list = [
            {'site': site, 'mode': 'list', 'title': a.language(30006),
             'content': '', 'url': params['url'],
             'cover_url': a.image('all.png', image),
             'backdrop_url': a.art(), 'type': 3},
            {'site': site, 'mode': 'category', 'title': a.language(30005),
             'content': '', 'url': home_url,
             'cover_url': a.image('categories.png', image),
             'backdrop_url': a.art(), 'type': 3}]
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'category':
        # The movie and scene category menus live in the first and
        # second 'nav fl' <ul> respectively.
        index = 1
        if 'scenes' in params['url'].lower():
            index = 2
        html = a.get_page(home_url)
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', 'nav fl'))
        item_list = []
        for item in soup.findAll('ul')[index].findAll({'a': True}):
            item_list.append(
                {'site': 'playporn', 'mode': 'list', 'url': item.get('href'),
                 'content': '', 'title': item.contents[0].encode('UTF-8'),
                 'cover_url': a.image(image, image),
                 'backdrop_url': a.art(), 'type': 3})
        if item_list:
            a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'list':
        if params.get('content', '') == 'search':
            item = a.search_input()
            if item:
                params['url'] = search_url + item
            else:
                exit(1)
        elif params.get('content', '') == 'goto':
            last_item = re.search('/page/([0-9]+)/', params['url'])
            if last_item:
                last_item = int(last_item.group(1))
            else:
                last_item = 10000
            item = a.page_input(last_item)
            if item:
                params['url'] = re.sub('/page/[0-9]+/',
                                       '/page/' + str(item) + '/',
                                       params['url'])
            else:
                exit(1)
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('body'))
        item_list = []
        params['mode'] = 'play'
        params['content'] = 'movies'
        params['type'] = 0
        params['context'] = 0
        params['duration'] = '7200'
        xbmcdict = XBMCDict(0).update(params)
        for item in soup.findAll('div', 'photo-thumb-image'):
            if item.a.get('href') not in false_positives:
                _dict = xbmcdict.copy()
                if 'scenes' in params['url']:
                    _dict['duration'] = '2700'
                    _dict['content'] = 'episodes'
                _dict['url'] = item.a.get('href')
                _dict['title'] = item.a.get('title').encode('UTF-8')
                _dict['tvshowtitle'] = _dict['title']
                _dict['originaltitle'] = _dict['title']
                _dict['cover_url'] = a.image(item.img.get('src'))
                _dict['thumb_url'] = _dict['cover_url']
                _dict['poster'] = _dict['cover_url']
                _dict['sub_site'] = site
                item_list.append(_dict)
        # Pagination links.
        soup = BeautifulSoup(html,
                             parseOnlyThese=SoupStrainer('div', 'more_entries'))
        if soup:
            item = soup.find('a', 'previouspostslink')
            if item:
                item_list.append(
                    {'site': site, 'mode': 'list', 'url': item.get('href'),
                     'content': params['content'],
                     'title': a.language(30017, True),
                     'cover_url': a.image('previous.png', image),
                     'backdrop_url': a.art(), 'type': 3})
            item = soup.find('a', 'nextpostslink')
            if item:
                item_list.append(
                    {'site': site, 'mode': 'list', 'url': item.get('href'),
                     'content': params['content'],
                     'title': a.language(30018, True),
                     'cover_url': a.image('next.png', image),
                     'backdrop_url': a.art(), 'type': 3})
            item = soup.find('a', 'last')
            if item:
                item_list.append(
                    {'site': site, 'mode': 'list', 'url': item.get('href'),
                     'content': 'goto',
                     'title': a.language(30019, True),
                     'cover_url': a.image('goto.png', image),
                     'backdrop_url': a.art(), 'type': 3})
        if item_list:
            a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'play':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer(
            'div', {'id': 'loopedSlider'}))
        # The player markup is hidden inside an HTML comment with escaped
        # tags, so unescape it before re-parsing.
        soup = soup.find(text=lambda text: isinstance(text, Comment))
        if soup:
            soup = re.sub('&lt;', '<', soup.encode('utf-8'))
            soup = re.sub('&gt;', '>', soup)
            soup = BeautifulSoup(soup,
                                 parseOnlyThese=SoupStrainer('div', 'video'))
            if soup:
                item_list = []
                xbmcdict = XBMCDict(0).update(params)
                for item in soup.findAll('iframe'):
                    _dict = xbmcdict.copy()
                    _dict['url'] = item.get('src').replace(
                        'http://playporn.to/stream/all/?file=',
                        '').encode('UTF-8')
                    # Rewrite known hoster embeds into direct video pages.
                    if 'flashx.tv' in _dict['url'].lower():
                        item = re.search('hash=(.+?)&', _dict['url'])
                        if item:
                            _dict['url'] = ('http://flashx.tv/video/' +
                                            item.group(1) + '/')
                    elif 'played.to' in _dict['url'].lower():
                        item = re.search('embed-([a-zA-Z0-9]+?)-.+?html',
                                         _dict['url'])
                        if item:
                            _dict['url'] = 'http://played.to/' + item.group(1)
                    item_list.append(_dict)
                if item_list:
                    from playback import Playback
                    Playback().choose_sources(item_list)
                else:
                    a.alert(a.language(30904, True), sound=False)
            else:
                a.alert(a.language(30904, True), sound=False)
        else:
            a.alert(a.language(30904, True), sound=False)
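# Hedged sketch of how the dispatcher above is driven: the hosting XBMC
# addon constructs the class with a params dict. `PlayPorn` is a
# hypothetical name for the class, which is not shown in the source.
# PlayPorn({'mode': 'main'})  # build the root menu
# PlayPorn({'mode': 'list', 'url': 'http://playporn.to/category/xxx-movie-stream/',
#           'content': ''})   # list movies, then hand off to mode 'play'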