Example #1
def google_news(self, e):
    query = urllib.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query
    
           
    dom = xml.dom.minidom.parse(urllib2.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    
    links = description.findAll('a')
    for link in links:
        link.extract()          
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()
    
    description = str(description).strip().decode("utf-8", 'ignore')
    description = tools.remove_html_tags(description)
    description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    if description.rfind(".")!=-1:
        description = description[0:description.rfind(".")+1]
    link = tools.shorten_url(newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    
    e.output = "%s - %s [ %s ]" % (title.encode("utf-8", 'ignore'), description.encode("utf-8", 'ignore'), link.encode("utf-8", 'ignore'))
    
    return e
Example #2
def google_news(self, e):
    query = urllib.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query

    dom = xml.dom.minidom.parse(urllib2.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(
        newest_news.getElementsByTagName('description')[0].childNodes[0].data)

    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()

    description = str(description).strip().decode("utf-8", 'ignore')
    description = tools.remove_html_tags(description)
    description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]
    link = tools.shorten_url(
        newest_news.getElementsByTagName('link')[0].childNodes[0].data)

    e.output = "%s - %s [ %s ]" % (title.encode("utf-8", 'ignore'),
                                   description.encode("utf-8", 'ignore'),
                                   link.encode("utf-8", 'ignore'))

    return e
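Examples #1 and #2 are the same Python 2 handler shown with two different formattings. The pattern they rely on is: fetch a Google News RSS feed with urllib2, parse it with xml.dom.minidom, and read the first <item>. A minimal standalone sketch of that step follows; the feed URL is taken from the example and may no longer serve RSS in this form, so treat it as illustrative only.

# Minimal sketch of the fetch-and-parse step used above (Python 2).
# The feed URL comes from the example and may no longer serve RSS.
import urllib2
import xml.dom.minidom

def first_item_title(url="http://news.google.com/news?ned=us&topic=h&output=rss"):
    dom = xml.dom.minidom.parse(urllib2.urlopen(url))
    item = dom.getElementsByTagName('item')[0]
    return item.getElementsByTagName('title')[0].childNodes[0].data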
Example #3
def crawlPost(link_id):
    exceptions = ["ipTRACKERonline.com"]
    core_url = "http://forum.419eater.com/forum/"
    page_name = "viewtopic.php"
    forum_args = "?t="
    response = None
    try:
        post = urllib2.urlopen(core_url + page_name + forum_args + link_id)
        soup = BeautifulSoup(post, convertEntities=BeautifulSoup.HTML_ENTITIES)
        try:  # hating BeautifulSoup
            response = soup.findAll("td", {"class": "postbody"})[1]
            response = response.renderContents()
            for elem in exceptions:
                if elem in response:
                    return None
            response = response.replace("<br />\n<br />\n", "<magic>\n")
            response = response.replace("<br />\n", "")
            response = response.replace("<magic>\n", "<br />\n")
            response = BeautifulSoup(response)
            response = response.findAll(text=True)
            response = "".join(response)
            response = response.encode("ascii", "ignore")
            response = response.replace("\r\n ", "\r\n")
            response = response.replace("\n\n  Quote:   ", "")
        except:
            return None
    except:
        print "Failing on:", core_url + page_name + forum_args + link_id
    return response  # WARNING: Unicode
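A hypothetical usage sketch for crawlPost: "12345" stands in for a real forum topic id, and the function returns None when the post cannot be fetched or contains one of the excluded strings.

# Hypothetical usage; "12345" is a placeholder topic id.
body = crawlPost("12345")
if body is not None:
    print body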
Example #4
def get_sent_tokens(text):
    text = BeautifulSoup(text.encode('utf-8').decode('ascii', 'ignore')).text
    sents = sent_tokenize(str(text))
    sent_tokens = [None] * len(sents)
    for i, sent in enumerate(sents):
        sent_tokens[i] = get_tokens(sent)
    return sent_tokens
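get_sent_tokens strips markup with BeautifulSoup, splits the text into sentences with NLTK's sent_tokenize, and tokenizes each sentence via the get_tokens helper shown in Example #10 below. A small sketch of the underlying NLTK calls, assuming the 'punkt' tokenizer data has been downloaded:

# Sketch of the NLTK tokenization these helpers rely on (requires
# nltk.download('punkt') to have been run once).
from nltk.tokenize import sent_tokenize, word_tokenize

sample = "BeautifulSoup strips the tags. NLTK splits what is left."
for sent in sent_tokenize(sample):
    print word_tokenize(sent.lower())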
Example #5
def parse_petharbor_table(content, category=None, batch_id=None):

    # cleanup html
    bs = BS(content)

    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("beautifulsoup"))

    # build parse tree
    tree = parser.parse(bs.encode())

    results = tree.findAll("table", attrs={'class': 'ResultsTable'})[0]

    ret = []

    for row in results.findAll('tr')[1:]:
        columns = row.findAll('td')
        cells = map(lambda x: x.encodeContents(), columns)
        # Split Name and ID
        name, code = parse_cond_parens(cells[1], reverse=True)
        # Split gender and spayed_or_neutered
        gender, spayed_or_neutered = parse_cond_parens(cells[2])
        ret.append({
            'petharbor_url': "http://petharbor.com/%s" % columns[0].find('a')['href'],
            'petharbor_img_thumb_url': columns[0].find('img')['src'],
            'name': capitalize_first(name) or code,
            'code': code,
            'gender': gender,
            'spayed_or_neutered': spayed_or_neutered,
            'main_color': cells[3],
            'breed': cells[4],
            'age': parse_petharbor_age(cells[5]),
            'brought_to_shelter': cells[6],
            'located_at': cells[7],
            'category': category,
            'batch_id': batch_id,
            'last_checked': datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        })

    return ret
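parse_petharbor_table expects the raw HTML of a PetHarbor search-results page; category and batch_id are simply copied into every returned row. A hypothetical call (the file name is a placeholder):

# Hypothetical usage; results.html is a saved PetHarbor results page.
html = open("results.html").read()
for row in parse_petharbor_table(html, category="dog", batch_id=1):
    print row['name'], row['breed'], row['age']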
Example #6
def unescape_xhtml(s):
	htmlInput = '<html>' + s
	# Livejournal stream has &apos; so we must use XHTML_ENTITIES
	unescaped = BeautifulSoup(
		htmlInput, convertEntities=BeautifulSoup.XHTML_ENTITIES
	).contents[0].string
	if not unescaped:
		unescaped = u""
	# Convert BeautifulSoup thing into real str
	return unescaped.encode("utf-8")
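unescape_xhtml wraps the input in an <html> tag so BeautifulSoup 3 parses it as a single text node; XHTML_ENTITIES is used because the LiveJournal stream contains &apos;, which the plain HTML entity set does not cover. A small usage example:

# Expected to decode XHTML entities back into characters (BeautifulSoup 3).
print unescape_xhtml("it&apos;s fine")  # -> it's fine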
Example #7
    def getVideos(self):  # blog --> "someblog.tumblr.com"
        i = 0
        posts = client.posts(self.b_name + ".tumblr.com",
                             limit=20,
                             type='video')['posts']
        while i < 20:
            blood = True  # placeholder flag; the database duplicate check below is commented out
            if blood:
                # #check if video is already in database
                # cursor.execute('''SELECT * FROM videos WHERE player=%s''', posts[i]['player'][0]['embed_code'])
                # rows = cursor.fetchall()

                # #this database insertion should be moved to the end of recursiveReblogger
                # if not any(posts[i]['player'][0]['embed_code'] in row for row in rows):
                # 	cursor.execute('''INSERT into videos(url, player, slug) VALUES("url placeholder", %s, %s)''', (posts[i]['player'][0]['embed_code'], posts[i]['slug']))
                # 	con.commit()

                #create the video object
                embed_code = posts[i]['player'][0]['embed_code']
                post_url = posts[i]['post_url']
                da_format = posts[i]['format']
                #for slug: replace hyphens with spaces, nltk tokenize
                slug = nltk.word_tokenize(posts[i]['slug'].encode(
                    'ascii', errors='ignore').replace('-', ' '))
                #for caption: use beautiful soup to get ONLY text in p tag, tokenize
                a = posts[i]['caption'].encode('ascii', errors='ignore')
                caption = []
                if a != "":
                    b = BeautifulSoup(a)('p')[-1].extract().string
                    if b is not None:
                        b = b.encode('ascii', errors='ignore')
                        caption = caption + nltk.word_tokenize(b)
                tags = posts[i]['tags']
                tag_holder = [str(t) for t in tags]
                #store the video objects in the unvisited_posts list
                self.unvisited_posts.insert(
                    0,
                    video(embed_code, post_url, da_format,
                          text_information(slug, caption, tag_holder)))
            else:
                print(str(i) + ": this video didn't work")
            i = i + 1
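The client object in getVideos is assumed here to be a pytumblr.TumblrRestClient; a hedged setup sketch with placeholder credentials:

# Assumed client setup (pytumblr); all four credentials are placeholders.
import pytumblr
client = pytumblr.TumblrRestClient('consumer_key', 'consumer_secret',
                                   'oauth_token', 'oauth_secret')
posts = client.posts('someblog.tumblr.com', limit=20, type='video')['posts']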
Example #8
	def search(self, keywords):
		url = "http://skreemr.com/results.jsp?q=%s&l=10" % urllib2.quote(" ".join(keywords))
		info_re = re.compile(r"<td>mp3\s+-\s+(?P<rate>.+)\s+-\s+(?P<duration>.+)\s+-\s+(?P<size>.+)&nbsp;<", re.IGNORECASE)
		tables = BeautifulSoup(get_content(url)).findAll('table')
		if len(tables) < 3:
			return
		for t in tables[2].findAll('table'):
			if t.object:
				r = Result()
				r.name = t.h2.a.string
				r.url = urllib2.unquote(t.object.find('param', { 'name' : 'FlashVars' })['value'].split('soundFile=')[1])
				r.__dict__.update(info_re.search(t.encode('utf8')).groupdict())
				yield r
		return
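search() is a generator that yields Result objects carrying name, url, and the rate/duration/size groups captured by the regex. A hypothetical caller (Skreemr stands in for whatever class defines this method):

# Hypothetical usage; Skreemr is a placeholder for the enclosing class.
for r in Skreemr().search(["artist", "title"]):
    print r.name, r.url, r.rate, r.duration, r.size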
Example #9
print "Crawling Pages, please wait..."
with tqdm(total=retrieveLimit) as progress:
    for page in urlList:
        if docIDCounter > retrieveLimit:
            break  #quits crawling if retrieval limit is reached
        try:
            # ---------- Page Crawler (gets words and links from each page) ----------
            soup = ""
            browse.open(page)
            if page.endswith(".txt"):
                soup = browse.response().read()
            else:
                soup = BeautifulSoup(browse.response().read())  # if it can't parse, assumed to be binary file or 404
                soup = soup.getText()
            hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest()
            if hashTest not in duplicateDetect:
                duplicateDetect.append(hashTest)
                wordsInPage = soup.split()
                if not page.endswith(".txt"):

                    for link in browse.links():
                        tempURL = urlparse.urljoin(link.base_url, link.url)
                        #BELOW: gets rid of duplicate urls resulting from index.html/index.htm
                        if tempURL.endswith("index.html"):
                            tempURL = tempURL.replace("index.html", "")
                        elif tempURL.endswith("index.htm"):
                            tempURL = tempURL.replace("index.htm", "")

                        if tempURL not in urlList:
                            if tempURL.startswith(baseUrl):
Example #10
def get_tokens(text):
    text = BeautifulSoup(text.encode('utf-8').decode('ascii', 'ignore')).text
    return word_tokenize(str(text).lower())
Example #11
    def prep_thread(self, thread):
        # board, url, no ###
        thread['board'] = self.board
        thread['url'] = 'http://boards.4chan.org/{0}/thread/{1}'\
                        .format(self.board, thread['no'])
        thread['no'] = str(thread['no'])
        ####################

        # text ############
        thread['text'] = 'No text available'
        if 'name' in thread:
            thread['text'] = thread['name']
        if 'sub' in thread:
            thread['text'] = thread['sub']
        if 'com' in thread:
            thread['text'] = thread['com']
        s = BeautifulSoup(thread['text']).getText()
        pars = HTMLParser()
        s = pars.unescape(s)
        thread['text'] = s.encode('utf8')
        ###################

        # age_s ###########
        thread['age_s'] = (datetime.now() -
                           datetime.fromtimestamp(thread['time'])).seconds
        ###################

        # age_hm ##########
        age = relativedelta(datetime.now(),
                            datetime.fromtimestamp(thread['time']))
        if age.hours:
            if age.minutes > 9:
                thread['age_hm'] = '{0}:{1}h'\
                    .format(age.hours, age.minutes)
            else:
                thread['age_hm'] = '{0}:0{1}h'\
                    .format(age.hours, age.minutes)
        else:
            thread['age_hm'] = '{0}min'.format(age.minutes)
        ###################

        # rpm #############
        if thread['age_s'] != 0:
            thread['rpm'] = float("%.1f" % (float(thread['replies']) * 60/
                                            float(thread['age_s'])))
        else:
            thread['rpm'] = 0
        ###################


        if 'filename' in thread:
            if thread['ext'] in ['.jpg', '.png']:
                thread['imgurl'] = ('https://i.4cdn.org/{0}/{1}{2}'
                                    .format(thread['board'],
                                            thread['tim'],
                                            thread['ext']))

        thread['formatted'] = (
            '*{0}/min ({1}r in {2})*\n{3}\n\n{4}\n\n(from {5})\n{6}'
            .format(thread['rpm'],
                    thread['replies'],
                    thread['age_hm'],
                    thread['imgurl'] if 'imgurl' in thread else '',
                    thread['text'],
                    thread['country_name'],
                    thread['url']))

        return thread
Example #12
    def prep_thread(self, thread):
        # board, url, no ###
        thread['board'] = self.board
        thread['url'] = 'http://boards.4chan.org/{0}/thread/{1}'\
                        .format(self.board, thread['no'])
        thread['no'] = str(thread['no'])
        ####################

        # text ############
        thread['text'] = 'No text available'
        if 'name' in thread:
            thread['text'] = thread['name']
        if 'sub' in thread:
            thread['text'] = thread['sub']
        if 'com' in thread:
            thread['text'] = thread['com']
        s = BeautifulSoup(thread['text']).getText()
        pars = HTMLParser()
        s = pars.unescape(s)
        thread['text'] = s.encode('utf8')
        ###################

        # age_s ###########
        thread['age_s'] = (datetime.now() -
                           datetime.fromtimestamp(thread['time'])).seconds
        ###################

        # age_hm ##########
        age = relativedelta(datetime.now(),
                            datetime.fromtimestamp(thread['time']))
        if age.hours:
            if age.minutes > 9:
                thread['age_hm'] = '{0}:{1}h'\
                    .format(age.hours, age.minutes)
            else:
                thread['age_hm'] = '{0}:0{1}h'\
                    .format(age.hours, age.minutes)
        else:
            thread['age_hm'] = '{0}min'.format(age.minutes)
        ###################

        # rpm #############
        if thread['age_s'] != 0:
            thread['rpm'] = float(
                "%.1f" %
                (float(thread['replies']) * 60 / float(thread['age_s'])))
        else:
            thread['rpm'] = 0
        ###################

        if 'filename' in thread:
            if thread['ext'] in ['.jpg', '.png']:
                thread['imgurl'] = ('https://i.4cdn.org/{0}/{1}{2}'.format(
                    thread['board'], thread['tim'], thread['ext']))

        thread['formatted'] = (
            '*{0}/min ({1}r in {2})*\n{3}\n\n{4}\n\n(from {5})\n{6}'.format(
                thread['rpm'], thread['replies'], thread['age_hm'],
                thread['imgurl'] if 'imgurl' in thread else '', thread['text'],
                thread['country_name'], thread['url']))

        return thread
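Examples #11 and #12 are the same prep_thread method under two formattings. It expects a thread dict as returned by 4chan's public catalog API; note that it reads thread['country_name'], which only appears on boards that attach country flags. A hedged fetch sketch (Python 2, no error handling):

# Sketch of feeding prep_thread from the public 4chan catalog endpoint.
import json
import urllib2

board = 'int'  # a board that includes country flags
catalog = json.load(urllib2.urlopen('https://a.4cdn.org/%s/catalog.json' % board))
threads = [t for page in catalog for t in page['threads']]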
Example #13
	bullet_list = bullet_str.split('</li>\n<li>')
	for pre_bullet in bullet_list:
		bullet = BeautifulSoup(pre_bullet).text
		writer.writerow([element['id'],'fertillization_info',bullet])
	#product check list info
	bullet_str = element['product_checklist_info']
	bullet_list = bullet_str.split('</span></li>  <li>')
	for pre_bullet in bullet_list:
		bullet = BeautifulSoup(pre_bullet).text
		writer.writerow([element['id'],'product_checklist_info',bullet])
	#attributes #ascii
	bullet_str = element['attributes']
	bullet_list = bullet_str.split('</li><li>')
	for pre_bullet in bullet_list:
		bullet = BeautifulSoup(pre_bullet).text
		writer.writerow([element['id'],'attributes',bullet.encode('utf8')])
	#pruning_info
	bullet_str = element['pruning_info']
	bullet_list = bullet_str.split('</li>\n<li>')
	for pre_bullet in bullet_list:
		bullet = BeautifulSoup(pre_bullet).text
		writer.writerow([element['id'],'pruning_info',bullet])

	#others
	writer.writerow([element['id'],'growth_rate',element['growth_rate']])
	writer.writerow([element['id'],'light',element['light']])
	writer.writerow([element['id'],'water_wise',element['water_wise']])
	writer.writerow([element['id'],'spacing',element['spacing']])
	writer.writerow([element['id'],'deer_resistant',element['deer_resistant']])
	writer.writerow([element['id'],'botanical_name',element['botanical_name'].encode('utf8')])
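The Example #13 fragment assumes a csv writer and a scraped element record are already in scope. A hedged setup sketch for the writer side (the file name is a placeholder; 'wb' is the right mode for the csv module on Python 2):

# Assumed setup for the fragment above; plants.csv is a placeholder name.
import csv
writer = csv.writer(open('plants.csv', 'wb'))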
Example #14
print "Crawling Pages, please wait..."
with tqdm(total=retrieveLimit) as progress:
    for page in urlList:
        if docIDCounter > retrieveLimit:
            break #quits crawling if retrieval limit is reached
        try:
            # ---------- Page Crawler (gets words and links from each page) ----------
            soup = ""
            browse.open(page)
            if page.endswith(".txt"):
                soup = browse.response().read()
            else:
                soup = BeautifulSoup(browse.response().read()) #if can't parse, assumed to be binary file or 404
                soup = soup.getText()
            hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest()
            if hashTest not in duplicateDetect:
                duplicateDetect.append(hashTest)
                wordsInPage = soup.split()
                if not page.endswith(".txt"):

                    for link in browse.links():
                        tempURL = urlparse.urljoin(link.base_url, link.url)
                        #BELOW: gets rid of duplicate urls resulting from index.html/index.htm
                        if tempURL.endswith("index.html"):
                            tempURL = tempURL.replace("index.html", "")
                        elif tempURL.endswith("index.htm"):
                            tempURL = tempURL.replace("index.htm", "")


                        if tempURL not in urlList:
Example #15
    def __init__(self, params):
        import re
        from addon import Addon
        from addondict import AddonDict as XBMCDict
        from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment

        a = Addon()
        site = self.__module__
        mode = params['mode']

        home_url = 'http://playporn.to/'
        search_url = home_url + '?submit=Search&s='
        movies_url = home_url + 'category/xxx-movie-stream/'
        scenes_url = home_url + 'category/xxx-clips-scenes-stream/'
        false_positives = ['http://playporn.to/deutsche-milfs-anonym-sex/']

        if mode == 'main':
            item_list = [{
                'site': site,
                'mode': 'list',
                'title': a.language(30003),
                'content': '',
                'url': home_url,
                'cover_url': a.image('recent.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'sub',
                'title': a.language(30001),
                'content': '',
                'url': movies_url,
                'cover_url': a.image('movies.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'sub',
                'title': a.language(30002),
                'content': '',
                'url': scenes_url,
                'cover_url': a.image('scenes.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'list',
                'title': a.language(30004),
                'content': 'search',
                'url': search_url,
                'cover_url': a.image('search.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }]
            item_list.extend(a.favs_hist_menu(site))
            item_list.extend(a.extended_menu())
            a.add_items(item_list)
            a.end_of_directory()

        elif mode == 'sub':
            item_list = [{
                'site': site,
                'mode': 'list',
                'title': a.language(30006),
                'content': '',
                'url': params['url'],
                'cover_url': a.image('all.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'category',
                'title': a.language(30005),
                'content': '',
                'url': home_url,
                'cover_url': a.image('categories.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }]
            a.add_items(item_list)
            a.end_of_directory()

        elif mode == 'category':
            index = 1
            if 'scenes' in params['url'].lower(): index = 2
            html = a.get_page(home_url)
            soup = BeautifulSoup(html,
                                 parseOnlyThese=SoupStrainer('ul', 'nav fl'))
            item_list = []
            for item in soup.findAll('ul')[index].findAll({'a': True}):
                item_list.extend([{
                    'site': 'playporn',
                    'mode': 'list',
                    'url': item.get('href'),
                    'content': '',
                    'title': item.contents[0].encode('UTF-8'),
                    'cover_url': a.image(image, image),
                    'backdrop_url': a.art(),
                    'type': 3
                }])
            if item_list:
                a.add_items(item_list)
                a.end_of_directory()

        elif mode == 'list':
            if params.get('content', '') == 'search':
                item = a.search_input()
                if item:
                    params['url'] = search_url + item
                else:
                    exit(1)
            elif params.get('content', '') == 'goto':
                last_item = re.search('/page/([0-9]+)/', params['url'])
                if last_item:
                    last_item = int(last_item.group(1))
                else:
                    last_item = 10000
                item = a.page_input(last_item)
                if item:
                    params['url'] = re.sub('/page/[0-9]+/',
                                           '/page/' + str(item) + '/',
                                           params['url'])
                else:
                    exit(1)
            html = a.get_page(params['url'])
            soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('body'))
            item_list = []
            params['mode'] = 'play'
            params['content'] = 'movies'
            params['type'] = 0
            params['context'] = 0
            params['duration'] = '7200'
            xbmcdict = XBMCDict(0).update(params)
            for item in soup.findAll('div', 'photo-thumb-image'):
                if not item.a.get('href') in false_positives:
                    _dict = xbmcdict.copy()
                    if 'scenes' in params['url']:
                        _dict['duration'] = '2700'
                        _dict['content'] = 'episodes'
                    _dict['url'] = item.a.get('href')
                    _dict['title'] = item.a.get('title').encode('UTF-8')
                    _dict['tvshowtitle'] = _dict['title']
                    _dict['originaltitle'] = _dict['title']
                    _dict['cover_url'] = a.image(item.img.get('src'))
                    _dict['thumb_url'] = _dict['cover_url']
                    _dict['poster'] = _dict['cover_url']
                    _dict['sub_site'] = site

                    item_list.extend([_dict])
            soup = BeautifulSoup(html,
                                 parseOnlyThese=SoupStrainer(
                                     'div', 'more_entries'))
            if soup:
                item = soup.find('a', 'previouspostslink')
                if item:
                    item_list.extend([{
                        'site': site,
                        'mode': 'list',
                        'url': item.get('href'),
                        'content': params['content'],
                        'title': a.language(30017, True),
                        'cover_url': a.image('previous.png', image),
                        'backdrop_url': a.art(),
                        'type': 3
                    }])
                item = soup.find('a', 'nextpostslink')
                if item:
                    item_list.extend([{
                        'site': site,
                        'mode': 'list',
                        'url': item.get('href'),
                        'content': params['content'],
                        'title': a.language(30018, True),
                        'cover_url': a.image('next.png', image),
                        'backdrop_url': a.art(),
                        'type': 3
                    }])
                item = soup.find('a', 'last')
                if item:
                    item_list.extend([{
                        'site': site,
                        'mode': 'list',
                        'url': item.get('href'),
                        'content': 'goto',
                        'title': a.language(30019, True),
                        'cover_url': a.image('goto.png', image),
                        'backdrop_url': a.art(),
                        'type': 3
                    }])
            if item_list:
                a.add_items(item_list)
                a.end_of_directory()

        elif mode == 'play':
            html = a.get_page(params['url'])
            soup = BeautifulSoup(html,
                                 parseOnlyThese=SoupStrainer(
                                     'div', {'id': 'loopedSlider'}))
            soup = soup.find(text=lambda text: isinstance(text, Comment))
            if soup:
                soup = re.sub('&lt;', '<', soup.encode('utf-8'))
                soup = re.sub('&gt;', '>', soup)
                soup = BeautifulSoup(soup,
                                     parseOnlyThese=SoupStrainer(
                                         'div', 'video'))
                if soup:
                    item_list = []
                    xbmcdict = XBMCDict(0).update(params)
                    for item in soup.findAll('iframe'):
                        _dict = xbmcdict.copy()
                        _dict['url'] = item.get('src').replace(
                            'http://playporn.to/stream/all/?file=',
                            '').encode('UTF-8')
                        if 'flashx.tv' in _dict['url'].lower():
                            item = re.search('hash=(.+?)&', _dict['url'])
                            if item:
                                _dict['url'] = 'http://flashx.tv/video/' + item.group(1) + '/'
                        elif 'played.to' in _dict['url'].lower():
                            item = re.search('embed-([a-zA-Z0-9]+?)-.+?html',
                                             _dict['url'])
                            if item:
                                _dict['url'] = 'http://played.to/' + item.group(1)
                        item_list.extend([_dict])
                    if item_list:
                        from playback import Playback
                        Playback().choose_sources(item_list)
                    else:
                        a.alert(a.language(30904, True), sound=False)
                else:
                    a.alert(a.language(30904, True), sound=False)
            else:
                a.alert(a.language(30904, True), sound=False)