import feedparser
import urllib2


def get_first_working_feed_link(url):
    """
        Try to use the given URL as a feed. If it works, return it.
        If it doesn't, load the HTML, extract candidate feed links from it,
        then test them one by one and return the first one that works.

        >>> get_first_working_feed_link('http://www.codinghorror.com/blog/')
        u'http://feeds.feedburner.com/codinghorror/'
        >>> get_first_working_feed_link('http://feeds.feedburner.com/codinghorror/')
        u'http://feeds.feedburner.com/codinghorror/'
    """

    # if the url is a feed itself, return it
    html = urllib2.urlopen(url).read(1000000)
    feed = feedparser.parse(html)
    
    if not feed.get("bozo", 1):
        return unicode(url)

    # construct the site url from the protocol name and the domain name
    parsed_url = urllib2.urlparse.urlparse(url)
    site_url = u"%s://%s" % (parsed_url.scheme, parsed_url.netloc)
    
    # parse the html extracted from the url, get all the potential feed
    # links from it, then try them one by one
    for link in extract_feed_links(html):
        if '://' not in link: # if we got a relative URL, make it absolute 
            link = site_url + link
        feed = feedparser.parse(link)
        if not feed.get("bozo", 1):
            return link

    return None
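# The snippet above assumes an extract_feed_links() helper that is not shown.
# Below is a minimal, regex-based sketch of what such a helper might look like
# (it only picks up <link type="application/rss+xml"> style tags; a real
# implementation would use an HTML parser):
import re

def extract_feed_links(html):
    """Yield candidate feed URLs found in <link type="application/rss+xml"> tags."""
    for tag in re.findall(r'<link[^>]+>', html, re.IGNORECASE):
        if re.search(r'type=["\']application/(?:rss|atom)\+xml["\']', tag, re.IGNORECASE):
            match = re.search(r'href=["\']([^"\']+)["\']', tag, re.IGNORECASE)
            if match:
                yield match.group(1)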
    def getFeed(self, url):
        """Fetch a feed.

        This may return a cached result if the cache entry is considered to
        be fresh. Returned feeds have been cleaned using the cleanFeed method.
        """
        now=time.time()

        chooser=getUtility(ICacheChooser)
        cache=chooser("collective.portlet.feedmixer.FeedCache")
        cached_data=cache.get(url, None)
        cache_timeout = int(self.cache_timeout)
        if cached_data is not None:
            (timestamp, feed)=cached_data
            if now-timestamp<cache_timeout:
                return feed

            newfeed=feedparser.parse(url,
                    etag=getattr(feed, "etag", None),
                    modified=getattr(feed, "modified", None))
            if newfeed.status==304:
                self.cleanFeed(feed)
                cache[url]=(now+cache_timeout, feed)
                return feed

        feed=feedparser.parse(url)
        self.cleanFeed(feed)
        cache[url]=(now+cache_timeout, feed)

        return feed
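# For reference, the ETag / Last-Modified handling used above also works on its
# own, outside the cache machinery. A standalone sketch (not the actual
# collective.portlet.feedmixer code):
import feedparser

def fetch_with_conditional_get(url, previous=None):
    """Re-fetch a feed, reusing the ETag/Last-Modified of a previous parse.

    `previous` is whatever feedparser.parse() returned last time (or None).
    Returns the previous result unchanged when the server answers 304.
    """
    etag = getattr(previous, "etag", None)
    modified = getattr(previous, "modified", None)
    feed = feedparser.parse(url, etag=etag, modified=modified)
    if previous is not None and getattr(feed, "status", None) == 304:
        return previous
    return feed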
    def test_empty_guid(self, get):
        get.return_value = responses(304)

        parsed = feedparser.parse(data_file('no-guid.xml'))
        data = list(filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]
        ))
        feed = FeedFactory.create(user__ttl=99999)
        with self.assertNumQueries(2):
            store_entries(feed.url, data)

        [entry] = es.manager.user(feed.user).fetch(annotate=feed.user)['hits']
        self.assertTrue(entry.guid)

        entry.delete()

        parsed = feedparser.parse(data_file('no-link-guid.xml'))
        data = list(filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]  # noqa
        ))
        feed = FeedFactory.create(user__ttl=99999)
        with self.assertNumQueries(2):
            store_entries(feed.url, data)
        [entry] = es.manager.user(feed.user).fetch()['hits']
        self.assertTrue(entry.guid)
Example #4
def check_github():
    old_version = {}
    for repo in repo_names:
        old_version[repo] = feedparser.parse(
            'https://github.com/' + account_name +
            '/' + repo + '/commits/' + branch + '.atom'
            )

    time.sleep(SLEEP_SECONDS)  # Wait then compare

    for repo in repo_names:
        new = feedparser.parse('https://github.com/' + account_name +
                               '/' + repo + '/commits/' + branch + '.atom')
        try:
            if new.entries[0] != old_version[repo].entries[0]:
                #author = new.entries[0].author.split()[0]  # First name
                author = new.entries[0].author_detail.href.split('/')[-1]
                commit_msg = new.entries[0].title
                print '\n'
                print "[" + repo + "] " + author + ": " + commit_msg
                print '\n'
                irc_msg("[" + repo + "] " + author + ": " + commit_msg)
        except:
            print "GitHub f****d up, I think. Here's what they gave us:"
            print new
Example #5
def test_classify_from_feed():
    rssParser = RssFeedParser()
    rssUrl_ny = 'http://newyork.craigslist.org/stp/index.rss'
    rssUrl_sf = 'http://sfbay.craigslist.org/stp/index.rss'
    rss_ny = feedparser.parse(rssUrl_ny)
    rss_sf = feedparser.parse(rssUrl_sf)
    vocabList, pSF, pNY = rssParser.localWords(rss_ny, rss_sf)
    def getFeed(self, url):
        """Fetch a feed.
        """
        now = time.time()

        chooser = getUtility(ICacheChooser)
        cache = chooser("observatorio.tema.FeedCache")

        cached_data = cache.get(url, None)
        if cached_data is not None:
            (timestamp, feed) = cached_data
            if now-timestamp < self.data.get('cache_timeout'):
                return feed

            newfeed = feedparser.parse(url,
                etag = getattr(feed, "etag", None),
                modified = getattr(feed, "modified", None))
            if newfeed.status == 304:
                self.cleanFeed(feed)
                cache[url] = (now + self.data.get('cache_timeout'), feed)
                return feed

        feed = feedparser.parse(url)
        self.cleanFeed(feed)
        cache[url] = (now + self.data.get('cache_timeout'), feed)

        return feed
def dl_rss(url, party='Democratic'):
    """Takes url to media/press release page and finds the rss feed there"""
    try:    
        if party == 'Democratic':
            links = get_prs_from_url(url, '//tr/td/a')
            for url in links:
                match = re.search(r'RSS_reader_Member\.asp\?Feed=(\d+)', url)
                if (match):
                    feed_id = match.group(1)
            rss_feed = feedparser.parse('http://www.pahouse.com/pr/xml/%s.xml' %(feed_id))
        if party == 'Republican':
            links = get_prs_from_url(url, '//div[@id="NewsRSS"]')
            try:
                rss_feed = feedparser.parse(links[0])
            except:
                rss_feed = feedparser.parse(links[1])
            # print rss_feed
        list_of_pr_dicts = []
        for entry in rss_feed['entries']:
            # print entry
            if entry['link'] == None:
                continue
            list_of_pr_dicts.append({"title":entry['title'], "datestamp":parse_dates(entry['published']), "url":entry['link']})
        return list_of_pr_dicts
    except:
        print "Could not get RSS Feed for %s.\nHere are the links:%s" %(url, links)
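# dl_rss() relies on two helpers that are not shown here. The sketches below are
# hypothetical reconstructions (lxml, requests and dateutil are assumptions on
# my part), not the project's actual code:
import requests
from dateutil import parser as dateparser
from lxml import html as lxml_html

def get_prs_from_url(url, xpath):
    """Fetch a page and return candidate link URLs for elements matching xpath."""
    tree = lxml_html.fromstring(requests.get(url).content)
    links = []
    for el in tree.xpath(xpath):
        href = el.get('href')  # anchors carry the link directly
        if href is None:       # container elements may wrap an <a> or <link>
            nested = el.xpath('.//a/@href') or el.xpath('.//link/@href')
            href = nested[0] if nested else None
        if href:
            links.append(href)
    return links

def parse_dates(date_string):
    """Turn an RSS pubDate string into a datetime object."""
    return dateparser.parse(date_string)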
Example #8
def read(request):
    #get initial time from feed
    getInitialFeed = feedparser.parse(PROTO + USERNAME + ":" + PASSWORD + "@" + SERVER + PATH)
    lastModified = getInitialFeed.entries[0].modified
    while True:
        #keep checking for feed
        scrapedFeed = feedparser.parse(PROTO+USERNAME+":"+PASSWORD+"@"+SERVER+PATH)
        scrapedModified = scrapedFeed.entries[0].modified #get time when feed is being checked
        if lastModified < scrapedModified: #if there is a new message (timestamp is greater than the last time the feed was checked)
            lastModified = scrapedModified #update the last time a new message arrived
            name1 = scrapedFeed.entries[0].author_detail.name #get details
            email1 = scrapedFeed.entries[0].author_detail.email
            content = str(scrapedFeed.entries[0].title) 
            
            try:
                user = User.objects.get(email = email1) #try to get user who sent it from database
            except:
                x = find(name1,' ')+1 #if user does not exist, create user in database
                first = name1[:x]
                addUser(name1, email1, first)    
                user = User.objects.get(email = email1)
            
            time1 = str(scrapedModified) #parse into string so it can be sliced
            time2 = time1[:10]+' '+time1[11:19] #edit string into a time that can be parsed
            time3 = datetime.strptime(time2, '%Y-%m-%d %H:%M:%S') #parse string into a datetime object
            underscorename = convert(user.username,' ','_')
            addMessage(user, email1, content, time3, underscorename) #add new Message object to database
        
        current = str(time.strftime('%X')) #get current time
        today = date.today() #get day of week today
        dayofweek = today.isoweekday() #get day of week

        check(current, dayofweek)
            
    return HttpResponse()
Example #9
    def getfeed(cls, feedurl, settings):
        feeddata = None
        retval = Notification()
        log = logging.getLogger('FeedGateway')
        log.info("Getting feed for %s", feedurl)
        try:
            if settings and settings["http_proxy"] and settings["http_proxy"] != "":
                prxy = urllib2.ProxyHandler({"http": settings["http_proxy"]})
                log.debug("Proxy being used is : %s", prxy)
            else:
                log.debug("No Proxy in use")
                prxy = None

            #print "before calling parse"
            if prxy:
                feeddata = feedparser.parse(feedurl, handlers=[prxy])
            else:
                feeddata = feedparser.parse(feedurl)

            #log.debug(feeddata)
            #print "after calling parse"
        except Exception, err:
            log.error("Getfeed failed for %s with error: %s", feedurl, str(err))
            raise
Example #10
    def expand_macro(self, formatter, name, args):
        outputText = ""
        #check arguments
        if (args == None or len(args) == 0):
            return None

        largs, kwargs = parse_args(args)

        if not kwargs.has_key('url'):
            return _usage()
        url = kwargs['url']

        if kwargs.has_key('details'):
            details=kwargs['details']
        else:
            details="false"

        if kwargs.has_key('proxy'):
            proxy=kwargs['proxy']
        else:
            proxy=None

        try:
            if proxy != None:
                proxyHandler = urllib2.ProxyHandler({"http":proxy})
                feedData = feedparser.parse(url, handlers = [proxyHandler])
            else:
                response = urlopen(Request(url))
                response.close()
                feedData = feedparser.parse(url)
            
        except HTTPError, e:
            outputText += "HTTP_ERROR("+str(e.code)+")"
Example #11
def view_rss_feed(request, rssID):
	# Get the rss by its ID
	qs = RssFeed.objects.filter(id=rssID)
	# If doesn't exist, or if too bad, return empty entries for error
	if not qs or qs[0].mark < 5:
		return render(request, "viewrss.html", {'entries': None})
	# if exist and accepted, get entries
	else:
		rss = qs[0]
		entries = FeedEntry.objects.filter(rssfeed=rss)
		# if entries doesn't exist, add all the entries
		if not entries:
			feed = feedparser.parse(rss.url)
			entries = feed['entries']
			for x in entries:
				x['published'] = parser.parse(x['published']).replace(tzinfo=None)
				entry = FeedEntry(rssfeed=rss, title=x['title'], date=x['published'], link=x['link'], summary=x['summary'])
				entry.save()
		# if entries already exist, check updated date of rss feed and add only news entries
		else:
			feed = feedparser.parse(rss.url)
			entries = feed['entries']
			for x in entries:
				x['published'] = parser.parse(x['published']).replace(tzinfo=None)
				if x['published'] > rss.updatedDate:
					entry = FeedEntry(rssfeed=rss, title=x['title'], date=x['published'], link=x['link'], summary=x['summary'])
					entry.save()
			# Update the rss update date
			rss.updatedDate = parser.parse(feed['feed']['updated']).replace(tzinfo=None)
			rss.save()
		return render(request, "viewrss.html", {'rss':rss, 'entries':entries})
Example #12
    def test_generate_utf8_encode_guid_bug(self):
        """Some feeds trigger utf8 bugs when the guid is generated."""
        feed_str = get_data_file("mobile_it.rss")
        feed = feedparser.parse(feed_str)
        for entry in feed["entries"]:
            guid = feedutil.get_entry_guid(feed, entry)
            self.assertTrue(guid.startswith("http://"))

        feed_str = get_data_file("no-steam.rss")
        feed = feedparser.parse(feed_str)
        for entry in feed["entries"]:
            guid = feedutil.get_entry_guid(feed, entry)
            self.assertTrue(guid)

        feed_str = get_data_file("fakultet.xml")
        feed = feedparser.parse(feed_str)
        for entry in feed["entries"]:
            guid = feedutil.get_entry_guid(feed, entry)
            self.assertTrue(guid)

        feed_str = get_data_file("poker_pl.rss")
        feed = feedparser.parse(feed_str)
        for entry in feed["entries"]:
            guid = feedutil.get_entry_guid(feed, entry)
            self.assertTrue(guid)
Example #13
def lookup(geo):
    """Look up articles for geo"""

    # Check cache
    try:
        if geo in lookup.cache:
            return lookup.cache[geo]
    except AttributeError:
        lookup.cache = {}

    # Replace special characters
    escaped = urllib.parse.quote(geo, safe="")

    # Get feed from Google
    feed = feedparser.parse(f"https://news.google.com/news/rss/local/section/geo/{escaped}")

    # If no items in feed, get feed from Onion
    if not feed["items"]:
        feed = feedparser.parse("http://www.theonion.com/feeds/rss")

    # Cache results
    lookup.cache[geo] = [{"link": item["link"], "title": item["title"]} for item in feed["items"]]

    # Return results
    return lookup.cache[geo]
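# A small usage sketch for lookup(). The geo string is only an illustration;
# the endpoint accepts whatever Google News' local section recognizes, such as
# a city name or postal code:
articles = lookup("Cambridge, Massachusetts")
for article in articles[:3]:
    print(article["title"], "->", article["link"])

# A second call with the same geo is answered from lookup.cache without
# another HTTP request.
articles = lookup("Cambridge, Massachusetts")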
Example #14
def craigslist_example():
    sf = feedparser.parse('http://sfbay.craigslist.org/eng/index.rss')
    ny = feedparser.parse('http://newyork.craigslist.org/eng/index.rss')
    sf_file = '/tmp/sf_top_words.csv'
    ny_file = '/tmp/ny_top_words.csv'

    get_top_words(sf, ny, sf_file, ny_file)
Example #15
def tech_blog_example():
    mashable = feedparser.parse('http://feeds.mashable.com/Mashable')
    tech_crunch = feedparser.parse('http://feeds.feedburner.com/TechCrunch/')
    mashable_file = '/tmp/mashable_top_words.csv'
    tech_crunch_file = '/tmp/tech_crunch_top_words.csv'

    get_top_words(mashable, tech_crunch, mashable_file, tech_crunch_file)
Example #16
    def test_304(self):
        # first retrieve the url
        u = 'http://localhost:8097/tests/http/http_status_304.xml'
        f = feedparser.parse(u)
        self.assertEqual(f.status, 200)
        self.assertEqual(f.entries[0].title, 'title 304')
        # extract the etag and last-modified headers
        e = [v for k, v in f.headers.items() if k.lower() == 'etag'][0]
        mh = [v for k, v in f.headers.items() if k.lower() == 'last-modified'][0]
        ms = f.updated
        mt = f.updated_parsed
        md = datetime.datetime(*mt[0:7])
        self.assertTrue(isinstance(mh, basestring))
        self.assertTrue(isinstance(ms, basestring))
        self.assertTrue(isinstance(mt, time.struct_time))
        self.assertTrue(isinstance(md, datetime.datetime))
        # test that sending back the etag results in a 304
        f = feedparser.parse(u, etag=e)
        self.assertEqual(f.status, 304)
        # test that sending back last-modified (string) results in a 304
        f = feedparser.parse(u, modified=ms)
        self.assertEqual(f.status, 304)
        # test that sending back last-modified (9-tuple) results in a 304
        f = feedparser.parse(u, modified=mt)
        self.assertEqual(f.status, 304)
        # test that sending back last-modified (datetime) results in a 304
        f = feedparser.parse(u, modified=md)
        self.assertEqual(f.status, 304)
Example #17
    def get_urls(self):
        """
        Returns a list of URLs to scrape
        """
        urls = []
        # NR comittees are LLP based
        if self.LLP:
            for i in self.LLP:
                roman_numeral = roman.toRoman(i)
                options = self.URLOPTIONS.copy()
                options['GP'] = roman_numeral
                options['NRBR'] = 'NR'
                url_options = urlencode(options)
                url_llp = "{}?{}".format(self.BASE_URL, url_options)
                rss = feedparser.parse(url_llp)

                print "GP {}: NR: {} Committees".format(
                    roman_numeral, len(rss['entries']))
                urls = urls + [entry['link'] for entry in rss['entries']]

        # AKT = aktiv, AUF = aufgeloest
        for aktauf in ['AKT', 'AUF']:
            options['NRBR'] = 'BR'
            options['R_AKTAUF'] = aktauf
            url_options = urlencode(options)
            url_br = "{}?{}".format(self.BASE_URL, url_options)
            rss = feedparser.parse(url_br)

            print "BR {}: {} Committees".format(
                aktauf, len(rss['entries']))
            urls = urls + [entry['link'] for entry in rss['entries']]

        return urls
Example #18
def fetch_and_parse_feed(url, etag=None, last_modified=None):
    # TODO implement etag & last_modified header
    url = sanitize_url(url)
    feed_parsed = feedparser.parse(url)
    if not hasattr(feed_parsed, 'status'):
        raise FetchingException("Connection error")
    elif feed_parsed.status not in (200, 301, 302):
        raise FetchingException("status_code is %d" % feed_parsed.status)
    if feed_parsed.version == '':
        # it's probably html instead of rss/atom
        resp = fetch_url(url)
        if resp.status_code not in (200, 301, 302):
            raise FetchingException("status_code is %d" % resp.status_code)
        soup = BeautifulSoup(resp.content)
        try:
            url = soup.find_all("link", rel="alternate")[0]['href']
        except (IndexError, KeyError):
            # alternate-link is missing
            raise FetchingException("Neither RSS nor good HTML...")
        if not url.startswith("http"):
            url = concat_urls(resp.url, url)
        feed_parsed = feedparser.parse(url)
        if feed_parsed.status not in (200, 301, 302):
            raise FetchingException("status_code is %d" % feed_parsed.status)
    if feed_parsed.status == 302:  # temporary redirect (Found)
        logger.warning("/!\\ temporary redirect (302) for %s", url)
        url = feed_parsed.href
    elif feed_parsed.status == 301:
        logger.warning("/!\\ permanent redirect (301) for %s", url)
    return {"feed": feed_parsed, "real_url": url}
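# One way the TODO above (etag & last_modified) could be wired in. This is a
# sketch only, reusing sanitize_url() and FetchingException from the snippet
# above; it is not the project's actual implementation:
def fetch_and_parse_feed_conditional(url, etag=None, last_modified=None):
    """Pass cached ETag/Last-Modified values through to feedparser so the
    server can answer 304 Not Modified."""
    url = sanitize_url(url)
    feed_parsed = feedparser.parse(url, etag=etag, modified=last_modified)
    if not hasattr(feed_parsed, 'status'):
        raise FetchingException("Connection error")
    if feed_parsed.status == 304:
        # nothing new; the caller should keep using its cached entries
        return {"feed": None, "real_url": url, "not_modified": True}
    return {"feed": feed_parsed, "real_url": url, "not_modified": False}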
Example #19
def get_accidents():
    feed = feedparser.parse(RAIB_ATOM_URL)
    accidents = []
    page = 1
    while feed.entries:
        for entry in feed.entries:
            if not is_report_or_bulletin(entry):
                continue
            new_accident = accident.Accident(
                    'en', get_pdf_link(entry.content[0].value, feed.href),
                    # Strip "Press release: "
                    'gb', ': '.join(entry.title.split(": ")[1:]), "RAIB")
            # Location is too hard to parse for now
            new_accident.longdesc = get_longdesc(entry.content[0].value)
            # Company is not provided (usually)
            # Let's parse the date with regex!
            regex = \
                re.compile(
                ".* ([0-9]?[0-9](st|nd|rd|th)? [^ ]* [0-9][0-9][0-9][0-9]).*")
            matches = regex.match(entry.summary)
            if not matches is None:
                new_accident.date = dateutil.parser.parse(matches.group(1))
            new_accident.published = dateutil.parser.parse(entry.published)
            new_accident.alturls = {'landing': entry.link}
            accidents.append(new_accident)
        page += 1
        feed = feedparser.parse(RAIB_ATOM_URL + "&page={}".format(page))
    return accidents
Example #20
    def test_empty_guid(self, get):
        get.return_value = responses(304)

        parsed = feedparser.parse(test_file('no-guid.xml'))
        data = filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]
        )
        feed = FeedFactory.create()
        with self.assertNumQueries(5):
            store_entries(feed.url, data)
        self.assertTrue(feed.entries.get().guid)

        feed.entries.all().delete()

        parsed = feedparser.parse(test_file('no-link-guid.xml'))
        data = filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]
        )
        feed = FeedFactory.create()
        with self.assertNumQueries(5):
            store_entries(feed.url, data)
        self.assertTrue(feed.entries.get().guid)
Example #21
def albums(request, template_name="g_gallery/gallery.html"):
    """ 
    A page with public albums from Picasa
    """
    
    # An old-fashioned way to access albums data
    #gd_client = gdata.photos.service.PhotosService()
    #albums = gd_client.GetUserFeed(user=DJANGO_SETTINGS_MODULE.GOOGLE_ACCOUNT)

    g_albums = []
    feed = feedparser.parse("https://picasaweb.google.com/data/feed/api/user/" + \
        DJANGO_SETTINGS_MODULE.GOOGLE_ACCOUNT)
    for album in feed['entries']:
        g_alb = g_album(album.title, album.link, album.media_thumbnail[0]['url'], \
            album.gphoto_numphotos, album.gphoto_id, album.summary)
        g_albums.append(g_alb)

    g_comments = []
    comments = feedparser.parse("https://picasaweb.google.com/data/feed/api/user/" + \
        DJANGO_SETTINGS_MODULE.GOOGLE_ACCOUNT + "?kind=comment&max-results=20")
    for comment in comments['entries']:
        com = g_comment(comment.content[0].value, comment.author, \
            comment.links[0].href, comment.gphoto_thumbnail, comment.id, \
                comment.gphoto_photoid)
        g_comments.append(com)

    return render_to_response(template_name, {
        "g_albums": g_albums,
        "g_comments": g_comments,
    }, context_instance=RequestContext(request))
Example #22
    def on_feed_input(self, feed, config):
        config = self.build_config(config)

        log.debug('Requesting feed `%s` url `%s`' % (feed.name, config['url']))

        # check etags and last modified -headers
        # let's not, flexget works better when feed contains all entries all the time ?
        etag = None
        modified = None
        """
        etag = feed.cache.get('etag', None)
        if etag:
            log.debug('Sending etag %s for feed %s' % (etag, feed.name))
        modified = feed.cache.get('modified', None)
        if modified:
            log.debug('Sending last-modified %s for feed %s' % (etag, feed.name))
        """

        # set timeout to one minute
        orig_timout = socket.getdefaulttimeout()
        try:
            socket.setdefaulttimeout(60)

            # get the feed & parse
            if urllib2._opener:
                rss = feedparser.parse(config['url'], etag=etag, modified=modified, handlers=urllib2._opener.handlers)
            else:
                rss = feedparser.parse(config['url'], etag=etag, modified=modified)
        except LookupError, e:
            raise PluginError('Unable to parse the RSS: %s' % e)
Example #23
def preprocess_public_feed():
    """
    reads the public feed - http://chroniclingamerica.loc.gov/batches/feed/
    and returns a dictionary of {batch name: released datetime}
    """
    LOGGER.info("processing public feed for released datetime")
    feed = feedparser.parse("http://chroniclingamerica.loc.gov/batches/feed/")
    batch_release_times = {}

    if len(feed.entries) == 0:
        LOGGER.error(
            "public feed did not return any batches! Check to make sure chroniclingamerica.loc.gov is running correctly"
        )

    cont = True
    while cont:
        for entry in feed.entries:
            batch_name = re.match(r'info:lc/ndnp/batch/(.+)', entry.id).group(1)
            # convert time.struct from feedparser into a datetime for django
            released = datetime.fromtimestamp(mktime(entry.updated_parsed))
            batch_release_times[batch_name] = released
            # if the batch starts with batch_ remove it, so that it works regardless of that prefix
            if batch_name.startswith("batch_"):
                batch_release_times[batch_name[6:]] = released

        next_page = get_next_page(feed)
        if next_page:
            feed = feedparser.parse(next_page)
        else:
            cont = False
    return batch_release_times
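# get_next_page() is not shown above. For an Atom feed paginated with
# RFC 5005 rel="next" links, a minimal sketch could be:
def get_next_page(feed):
    """Return the href of the feed-level rel="next" link, or None."""
    for link in feed.feed.get("links", []):
        if link.get("rel") == "next":
            return link.get("href")
    return None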
Example #24
    def clean_fields(self, exclude=None):
        super(Feed, self).clean_fields(exclude)
        errors = {}
        parsed_feed = feedparser.parse(self.url)

        # parsed feed is an invalid feed
        # TODO add more robust error handling
        if (parsed_feed.bozo and not parsed_feed.entries):
            # try finding a feed at the site
            feeds = find_feeds(self.url)

            if (feeds):
                self.url = feeds[0]
                parsed_feed = feedparser.parse(self.url)
            else:
                urlname = self._meta.get_field('url').name
                message = "Unable to find a feed at '{0}'".format(self.url)
                errors[urlname] = ([message])
                raise ValidationError(errors)

        # if no title then use the feed's title
        if not self.title:
            self.title = parsed_feed.feed.title

        # set the source of the feed
        if parsed_feed.feed.title:
            self.source = urlparse.urlparse(parsed_feed.feed.link).hostname
        else:
            self.source = urlparse.urlparse(self.url).hostname
Example #25
    def test_details(self):
        [lp] = self.english_1.license_pools

        lp.suppressed = False
        with self.app.test_request_context("/"):
            response = self.manager.admin_work_controller.details(lp.data_source.name, lp.identifier.type, lp.identifier.identifier)
            eq_(200, response.status_code)
            feed = feedparser.parse(response.get_data())
            [entry] = feed['entries']
            suppress_links = [x['href'] for x in entry['links']
                              if x['rel'] == "http://librarysimplified.org/terms/rel/hide"]
            unsuppress_links = [x['href'] for x in entry['links']
                                if x['rel'] == "http://librarysimplified.org/terms/rel/restore"]
            eq_(0, len(unsuppress_links))
            eq_(1, len(suppress_links))
            assert lp.identifier.identifier in suppress_links[0]

        lp.suppressed = True
        with self.app.test_request_context("/"):
            response = self.manager.admin_work_controller.details(lp.data_source.name, lp.identifier.type, lp.identifier.identifier)
            eq_(200, response.status_code)
            feed = feedparser.parse(response.get_data())
            [entry] = feed['entries']
            suppress_links = [x['href'] for x in entry['links']
                              if x['rel'] == "http://librarysimplified.org/terms/rel/hide"]
            unsuppress_links = [x['href'] for x in entry['links']
                                if x['rel'] == "http://librarysimplified.org/terms/rel/restore"]
            eq_(0, len(suppress_links))
            eq_(1, len(unsuppress_links))
            assert lp.identifier.identifier in unsuppress_links[0]
Example #26
def get_feed(url):
    try:
        feed = feedparser.parse(url)

        if feed.bozo:
            exc = feed.bozo_exception
            print ("Error detected on %s:" % url)
            print exc

        if feed.has_key("status"):
            if feed.status == 302:
                feed = feedparser.parse(feed.href)
            elif feed.status == 200:
                pass
            else:
                print ("%s returned http status code %s" % (url, feed.status))

    except AttributeError as e:
        print ("skipping", url, "- error:", e.args[0])

    except KeyError as e:
        print ("skipping", url, "- error:", e.args[0])

    except KeyboardInterrupt:
        print ("^C recognized - stopping now")
        exit(1)

    return feed
Example #27
  def fetch(self, force=False):
    etag = self.feed.etag
    modified = self.feed.last_modified
    address = self.feed.feed_address
    if force:
      etag = None
      modified = None

    USER_AGENT = 'NewsBlur Feed Fetcher - %s subscriber%s - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.2.3 (KHTML, like Gecko) Version/5.2)' % (
            self.feed.num_subscribers,
            's' if self.feed.num_subscribers > 1 else '',
            settings.READER4YOU_URL
        )
    try:
      fpfeed = feedparser.parse(address,
                                agent=USER_AGENT,
                                etag=etag,
                                modified=modified)
    except (TypeError, ValueError), e:
      feedparser.PARSE_MICROFORMATS = False
      fpfeed = feedparser.parse(address,
                                agent=USER_AGENT,
                                etag=etag,
                                modified=modified)
      feedparser.PARSE_MICROFORMATS = True
Example #28
    def get_feed(self, url):
        """Given url might be point to http document or to actual feed. In case
        of http document, we try to find first feed auto discovery url.
        """
        stripped = url.strip()

        try:
            resp = requests.get(stripped)
        except Exception as err:
            return {'success': False, 'output': str(err)}

        feed = feedparser.parse(resp.text)
        if feed.version != '':
            return {'success': True, 'output': (feed, stripped)}

        urls = FeedFinder.parse(resp.text)
        feed_url = ''
        if len(urls) > 0:
            # Each url is tuple where href is first element.
            # NOTE : Sites might have several feeds available and we are just
            # naively picking first one found.
            feed_url = urls[0][0]
            if urlparse(feed_url)[1] == '':
                # We have empty 'netloc', meaning we have relative url
                feed_url = urljoin(stripped, feed_url)
        return {'success': True, 'output': (feedparser.parse(feed_url), feed_url)}
Example #29
def get_latest_links(lang_name):
    import feedparser

    links = []
    i = 1
    if lang_name == "az":
        feed = feedparser.parse("http://milli.az/rss.php")
        for e in feed.entries:
            if "qtisadiyyat" in e.category:
                links.append((e.title, e.link))
                if i != 5:
                    i = i + 1
                else:
                    break
    if lang_name == "en":
        feed = feedparser.parse("http://today.az/rss.php")
        for e in feed.entries:
            if "Business" in e.category:
                links.append((e.title, e.link))
                if i != 5:
                    i = i + 1
                else:
                    break
    else:
        feed = feedparser.parse("http://1news.az/rss.php?sec_id=21")
        for e in feed.entries[:4]:
            links.append((e.title, e.link))
    return links
Example #30
    def test_same_guids(self, get):
        get.return_value = responses(304)
        feed = FeedFactory.create()

        parsed = feedparser.parse(test_file('aldaily-06-27.xml'))
        data = filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]
        )

        with self.assertNumQueries(5):
            store_entries(feed.url, data)
        self.assertEqual(feed.entries.count(), 4)

        data = filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]
        )
        with self.assertNumQueries(2):
            store_entries(feed.url, data)
        self.assertEqual(feed.entries.count(), 4)

        parsed = feedparser.parse(test_file('aldaily-06-30.xml'))
        data = filter(
            None,
            [UniqueFeed.objects.entry_data(
                entry, parsed) for entry in parsed.entries]
        )

        with self.assertNumQueries(5):
            store_entries(feed.url, data)
        self.assertEqual(feed.entries.count(), 10)
Example #31
import RPi.GPIO as GPIO, feedparser, time, os.path

USERNAME = "******"
PASSWORD = "******"

GPIO.setwarnings(False)
GPIO.setmode(GPIO.BOARD)
LIGHTS = 18
GPIO.setup(7, GPIO.OUT)

cur_mails = int(
    feedparser.parse("https://" + USERNAME + ":" + PASSWORD +
                     "@mail.google.com/gmail/feed/atom")["feed"]["fullcount"])

print("You have " + str(cur_mails) + " emails in your inbox.")

if os.path.isfile("emails.txt") == False:  # create the file if it doesn't exist
    f = open('emails.txt', 'w')
    f.write('1')
    # the interpreter doesn't like reading from an empty file
    f.close()

f = open('emails.txt', 'r')
last_mails = int(f.read())
f.close()

print("Last known number of emails is " + str(last_mails))

if cur_mails < last_mails:
    last_mails = cur_mails
    f = open('emails.txt', 'w')
Example #32
         try:
             data = open(feed_link, 'r').read()
         except IOError:
             self.feed.feed_link = 'http://' + feed_link
             self.fetch_page(urllib_fallback=True)
             return
     if data:
         html = self.rewrite_page(data)
         self.save_page(html)
     else:
         self.save_no_page()
         return
 except (ValueError, urllib2.URLError, httplib.BadStatusLine,
         httplib.InvalidURL, requests.exceptions.ConnectionError), e:
     self.feed.save_page_history(401, "Bad URL", e)
     fp = feedparser.parse(self.feed.feed_address)
     feed_link = fp.feed.get('link', "")
     self.feed.save()
 except (urllib2.HTTPError), e:
     self.feed.save_page_history(e.code, e.msg, e.fp.read())
 except (httplib.IncompleteRead), e:
     self.feed.save_page_history(500, "IncompleteRead", e)
 except (requests.exceptions.RequestException,
         requests.packages.urllib3.exceptions.HTTPError), e:
     logging.debug(
         '   ***> [%-30s] Page fetch failed using requests: %s' %
         (self.feed, e))
     # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
     return self.fetch_page(urllib_fallback=True, requests_exception=e)
 except Exception, e:
     logging.debug('[%d] ! -------------------------' %
Example #33
def get_news(publication):
    feed = feedparser.parse(RSS_Feed[publication.lower()])
    return feed['entries']
Example #34
def download_rss(url):
    return feedparser.parse(url)
Example #35
import csv, feedparser, time
from datetime import date
from storefront.models import BlogPostInfo

blog_posts = BlogPostInfo.objects.all()
feed = feedparser.parse('http://blog.indextank.com/feed/')

for item in feed['items']:
    if not any(item['link'] == post.url for post in blog_posts):
        # if there isn't a post with this url, then create it
        d = date(item['date_parsed'][0], item['date_parsed'][1],
                 item['date_parsed'][2])
        BlogPostInfo.objects.create(url=item['link'].encode('utf-8'),
                                    title=item['title'].encode('utf-8'),
                                    author=item['author'].encode('utf-8'),
                                    date=d)
Example #36
]

# Date and time setup. I want only posts from "today,"
# where the day lasts until 2 AM.
utc = pytz.utc
homeTZ = pytz.timezone('US/Central')
dt = datetime.now(homeTZ)
if dt.hour < 2:
    dt = dt - timedelta(hours=24)
start = dt.replace(hour=0, minute=0, second=0, microsecond=0)
start = start.astimezone(utc)

# Collect all of today's posts and put them in a list of tuples.
posts = []
for s in subscriptions:
    f = fp.parse(s)
    try:
        blog = f['feed']['title']
    except KeyError:
        continue
    for e in f['entries']:
        try:
            when = e['updated_parsed']
        except KeyError:
            when = e['published_parsed']
        when = utc.localize(datetime.fromtimestamp(time.mktime(when)))
        if when > start:
            title = e['title']
            try:
                body = e['content'][0]['value']
            except KeyError:
Example #37
import feedparser

rss_url = "http://news.google.com/?output=rss"
feed = feedparser.parse(rss_url)
count = len(feed['entries'])
for i in range(0, count):
    if (i >= 9):
        break
    print '{0: <83} {1}...'.format(' ',
                                   feed.entries[i].title[0:40].encode('utf8'))
Example #38
def get_random():
    response = feedparser.parse("https://news.ycombinator.com/rss")
    entries = response["entries"]
    number_of_articles = len(entries)
    article_selected = random.randrange(number_of_articles)
    return entries[article_selected]
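# Usage sketch: pick one random story from the Hacker News front-page feed.
story = get_random()
print(story["title"], "->", story["link"])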
Example #39
    def onHeartbeat(self):
        now = datetime.now()
        rss = ""
        feed = ""
        FeedValueFTd = ""
        FeedValueFTm = ""

        if now >= self.nextpoll:
            self.nextpoll = now + timedelta(seconds=self.pollinterval)
            rss = Parameters["Mode1"]
            feed = feedparser.parse(rss)
            for key in feed["entries"]:
                FeedValue = str(key["description"])
                FeedValue = '<tr>TODAY ' + FeedValue.split('Today')[1]

                FeedValueFTd = FeedValue.split('Tomorrow')[0]
                FeedValueFTm = FeedValue.split('Tomorrow')[1]
                Domoticz.Log("Gathering Data for:" + str(key["title"]))

                if not (FeedValueFTd.strip().find('wflag-l1') == -1):
                    Domoticz.Debug(
                        "Alarm(s) for Today: No special awareness required.")
                    #Domoticz.Log("Data Of Warning:"+str(FeedValueFTd.strip()))
                    Domoticz.Debug("Type Of Warning:" +
                                   str(FeedValueFTd.find('wflag-l1-t5.jpg')))
                    Domoticz.Debug("Data:" +
                                   str(FeedValueFTd).replace('<', '-'))
                    ValueToUpdate = "No special awareness required"
                    #Devices[1].Update(1,ValueToUpdate, Image=Images[icon].ID)

                    Domoticz.Debug("Current Awareness Status:" +
                                   Devices[1].sValue + " with Level " +
                                   str(Devices[1].nValue))
                    if (ValueToUpdate != Devices[1].sValue):
                        Domoticz.Log("Awareness for Today Updated to:" +
                                     ValueToUpdate)
                        Devices[1].Update(1, ValueToUpdate)
                    else:
                        Domoticz.Log("Awareness Remains Unchanged for Today.")
                else:
                    Domoticz.Debug(
                        "------FEEDPARSER OUTPUT for TODAY:------------------")
                    #Domoticz.Log("Type Of Warning:"+str(FeedValueFTd.find('wflag-l1-t5.jpg')))
                    #Domoticz.Log("Data:"+str(FeedValueFTd).replace('<br>','').replace('</br>','').replace('<td>','').replace('</td>','').replace('<tr>','').replace('</tr>','').replace('<b>','').replace('</b>','').replace('<i>','').replace('</i>','').replace('<',''))
                    FeedValueFTdPeriod = FeedValueFTd.split('<td>')[0]
                    FeedValueFTdPeriod = FeedValueFTdPeriod.split('alt="')[1]
                    FeedValueFTdPeriod = FeedValueFTdPeriod.split(':')

                    Domoticz.Debug(
                        "Icon:" +
                        FeedValueFTd.split('<td>')[0].replace('<', '-'))
                    AWTPossitions = FeedValueFTd.replace('<',
                                                         '-').split('awt:')
                    #if AWTPossitions[2]: Domoticz.Log("AWT Possitions 2:"+AWTPossitions[2])
                    WarningText = ""
                    for AWTPos in range(1, len(AWTPossitions)):
                        AWTvalue = ""
                        LEVELvalue = ""
                        AWTvalue = AWTPossitions[AWTPos].split(
                            'level')[0].strip()
                        Domoticz.Debug("AWT Possitions Value " + str(AWTPos) +
                                       ":" + AWTvalue)
                        #LEVELvalue = AWTPossitions[AWTPos].split('level:')[1].split('border')[0].replace('"','').strip()
                        LEVELvalue = AWTPossitions[AWTPos].split(
                            'level:')[1].split('"')[0]
                        Domoticz.Debug("Level Possitions Value " +
                                       str(AWTPos) + ":" + LEVELvalue)
                        AWTtext = AWTvalue
                        if (AWTvalue == "1"): AWTtext = "Wind"
                        if (AWTvalue == "2"): AWTtext = "Snow/Ice"
                        if (AWTvalue == "3"): AWTtext = "ThunderStorm"
                        if (AWTvalue == "4"): AWTtext = "Fog"
                        if (AWTvalue == "5"): AWTtext = "High Temp"
                        if (AWTvalue == "6"): AWTtext = "Low Temp"
                        if (AWTvalue == "7"): AWTtext = "Coastal Event"
                        if (AWTvalue == "8"): AWTtext = "Forestfire"
                        if (AWTvalue == "9"): AWTtext = "Avalanches"
                        if (AWTvalue == "10"): AWTtext = "Rain"
                        if (AWTvalue == "11"): AWTtext = "Flood"
                        if (AWTvalue == "12"): AWTtext = "Rain-Flood"
                        if (AWTPos > 1): WarningText = WarningText + ", "
                        WarningText = WarningText + AWTtext + "(" + LEVELvalue + ")"
                        Domoticz.Debug("Alarm(s) for today:" +
                                       str(WarningText))
                    Domoticz.Debug("AWT:" + FeedValueFTdPeriod[1].split(' ')
                                   [0].replace('<', '-').replace('>', '-'))
                    Domoticz.Debug("Level:" + FeedValueFTdPeriod[2].split('"')
                                   [0].strip().replace('<', '-'))
                    Domoticz.Debug(
                        "Period:" + FeedValueFTd.split('<td>')[1].strip(
                        ).replace('<br>', '').replace('</br>', '').replace(
                            '<td>', '').replace('</td>', '').replace('<', '-'))
                    #Domoticz.Log("MessageLocal:"+FeedValueFTd.split('<td>')[2].split('.')[0].strip())
                    #Domoticz.Log("MessageEn:"+FeedValueFTd.split('<td>')[2].split('.')[1].strip().replace('<','-'))
                    #Domoticz.Log("MessageEn:"+FeedValueFTd.split('<td>')[2].split('.')[1].split('english:')[1].strip())
                    #ValueToUpdate=FeedValueFTd.split('<td>')[2].split('.')[1].split('english:')[1].strip()
                    if (LEVELvalue == "5"): LEVELvalue = "1"

                    Domoticz.Debug("Current Awareness Status:" +
                                   Devices[1].sValue + " with Level " +
                                   str(Devices[1].nValue))
                    if (WarningText != Devices[1].sValue) or (
                            int(LEVELvalue) != Devices[1].nValue):
                        Domoticz.Log("Awareness for Today Updated to:" +
                                     WarningText)
                        Devices[1].Update(int(LEVELvalue), WarningText)
                    else:
                        Domoticz.Log("Awareness Remains Unchanged for Today.")

                if not (FeedValueFTm.strip().find('wflag-l1') == -1):
                    Domoticz.Debug(
                        "Alarm(s) for Tomorrow: No special awareness required")
                    #Domoticz.Log("Data Of Warning:"+str(FeedValueFTm.strip()))
                    Domoticz.Debug("Type Of Warning:" +
                                   str(FeedValueFTm.find('wflag-l1-t5.jpg')))
                    ValueToUpdate = "No special awareness required"
                    Domoticz.Debug("Current Awareness Status:" +
                                   Devices[2].sValue + " with Level " +
                                   str(Devices[2].nValue))
                    if (ValueToUpdate != Devices[2].sValue):
                        Domoticz.Log("Awareness for Tomorrow Updated to:" +
                                     ValueToUpdate)
                        Devices[2].Update(1, ValueToUpdate)
                    else:
                        Domoticz.Log(
                            "Awareness Remains Unchanged for Tomorrow.")
                else:
                    #FeedValueFTm = FeedValueFTd.split('<tr>')
                    Domoticz.Debug(
                        "------FEEDPARSER OUTPUT for TOMORROW:------------------"
                    )
                    #Domoticz.Log("Type Of Warning:"+str(FeedValueFTm.find('awt:5')))
                    FeedValueFTmPeriod = FeedValueFTm.split('<td>')[0]
                    FeedValueFTmPeriod = FeedValueFTmPeriod.split('alt="')[1]
                    FeedValueFTmPeriod = FeedValueFTmPeriod.split(':')

                    Domoticz.Debug(
                        "Icon:" +
                        FeedValueFTm.split('<td>')[0].replace('<', '-'))
                    AWTPossitions = FeedValueFTm.replace('<',
                                                         '-').split('awt:')
                    #if AWTPossitions[2]: Domoticz.Log("AWT Possitions 2:"+AWTPossitions[2])
                    WarningText = ""
                    HLEVELvalue = 1
                    for AWTPos in range(1, len(AWTPossitions)):
                        AWTvalue = ""
                        LEVELvalue = ""
                        AWTvalue = AWTPossitions[AWTPos].split(
                            'level')[0].strip()
                        Domoticz.Debug("AWT Possitions Value " + str(AWTPos) +
                                       ":" + AWTvalue)
                        #LEVELvalue = AWTPossitions[AWTPos].split('level:')[1].split('border')[0].replace('"','').strip()
                        LEVELvalue = AWTPossitions[AWTPos].split(
                            'level:')[1].split('"')[0]
                        Domoticz.Debug("Level Possitions Value " +
                                       str(AWTPos) + ":" + LEVELvalue)
                        AWTtext = AWTvalue
                        if (AWTvalue == "1"): AWTtext = "Wind"
                        if (AWTvalue == "2"): AWTtext = "Snow/Ice"
                        if (AWTvalue == "3"): AWTtext = "ThunderStorm"
                        if (AWTvalue == "4"): AWTtext = "Fog"
                        if (AWTvalue == "5"): AWTtext = "High Temp"
                        if (AWTvalue == "6"): AWTtext = "Low Temp"
                        if (AWTvalue == "7"): AWTtext = "Coastal Event"
                        if (AWTvalue == "8"): AWTtext = "Forestfire"
                        if (AWTvalue == "9"): AWTtext = "Avalanches"
                        if (AWTvalue == "10"): AWTtext = "Rain"
                        if (AWTvalue == "11"): AWTtext = "Flood"
                        if (AWTvalue == "12"): AWTtext = "Rain-Flood"
                        WarningText = WarningText + AWTtext + "(" + LEVELvalue + ")"
                        if (AWTPos > 1): WarningText = WarningText + ", "
                        Domoticz.Debug("Alarm(s) for Tomorrow:" +
                                       str(WarningText))
                        if (int(LEVELvalue) > HLEVELvalue):
                            HLEVELvalue = int(LEVELvalue)

                    Domoticz.Debug(
                        "Icon:" +
                        FeedValueFTm.split('<td>')[0].replace('<', '-'))
                    Domoticz.Debug("AWT:" + FeedValueFTmPeriod[1].split(' ')
                                   [0].strip().replace('<', '-'))
                    Domoticz.Debug("Level:" + FeedValueFTmPeriod[2].split('"')
                                   [0].strip().replace('<', '-'))
                    #Domoticz.Log("Period:"+FeedValueFTm.split('<td>')[1].strip().replace('<','-'))
                    #Domoticz.Log("MessageLocal:"+FeedValueFTm.split('<td>')[2].split('.')[0].strip().replace('<','-'))
                    #Domoticz.Log("MessageEn:"+FeedValueFTm.split('<td>')[2].split('.')[1].split('english:')[1].strip().replace('<','-'))
                    #Domoticz.Log(FeedValueFTm)
                    #ValueToUpdate=FeedValueFTm.split('<td>')[2].split('.')[1].split('english:')[1].strip().replace('<','-')
                    if (HLEVELvalue == 5): HLEVELvalue = 0

                    Domoticz.Debug("Current Awareness Status:" +
                                   Devices[2].sValue + " with Level " +
                                   str(Devices[2].nValue))
                    if (WarningText != Devices[2].sValue) or (
                            int(HLEVELvalue) != Devices[2].nValue):
                        Domoticz.Log("Awareness for Tomorrow Updated to:" +
                                     WarningText)
                        Devices[2].Update(HLEVELvalue, WarningText)
                    else:
                        Domoticz.Log(
                            "Awareness Remains Unchanged for Tomorrow.")

                    Domoticz.Debug(
                        "----------------------------------------------------")
Example #40
outfile = codecs.open(outputfile, 'a', 'utf-8')

allWords = []

with open(inputfile) as infile:

    for account in infile:
        account = account.strip()
        time.sleep(1)

        print "Getting post titles for user: " + account
        url = "http://%s" % urllib.quote(account)
        url += ".livejournal.com/data/rss"

        try:
            feed = feedparser.parse(
                url, agent='Web scraper by [email protected] @LawrenceA_UK')

        except:
            print "Error getting user: " + account
            continue

        # Get titles of last five posts

        for i in range(0, 4):

            try:
                # Get post title
                titleText = feed.entries[i].title

                # Get list of words from post title
                titleWords = re.findall(r'[\w]+', titleText, re.U)
Example #41
#Author - Samaksh Yadav
#Description - The following scraper scrapes recent content available at TECHCRUNCH.COM
#Version 1.0

import feedparser

from urllib2 import urlopen
from bs4 import BeautifulSoup

rss = {'http://feeds.feedburner.com/TechCrunch/'}

for key in rss:
    print(key)
    d = feedparser.parse(key)
    for post in d.entries:
        try:
            html = urlopen(post.link)
            bsObj = BeautifulSoup(html, "html.parser")
            str1 = str(bsObj.find("div", attrs={"class":
                                                "article-entry text"}))
            str2 = str(
                bsObj.find("div",
                           attrs={"class": "aside aside-related-articles"}))
            str3 = bsObj.findAll("script")
            cleantext = bsObj.find("div",
                                   attrs={
                                       "class": "article-entry text"
                                   }).get_text()
            date = bsObj.find("meta",
                              attrs={
                                  "class": "swiftype",
Example #42
import feedparser

newsfeed = feedparser.parse(
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms")
print("number of rss", len(newsfeed.entries))
entry = newsfeed.entries[1]
print(entry.title)
print(entry.published)
print(entry.summary)
print(entry.link)
# https://gist.github.com/lukf/9785293

# -*- coding: utf-8 -*-
import feedparser, webbrowser, urllib, console, sys, datetime, urllib2, time
console.clear()
selected = "no"
feedURL = "http://feeds.pinboard.in/rss/secret:YOURSECRET/u:USERNAME/"  # RSS feed URL
outp = "Bookmarks from today \n|Time|Page|Tags|\n|:---|:---|:---|\n"  # header
dayone_footer = "#bookmark"  # Gets appended to the entry
now = datetime.datetime.now()
for post in feedparser.parse(
        urllib2.urlopen(feedURL).read().replace('dc:subject>',
                                                'subject>')).entries:
    postDate = datetime.datetime.strptime(
        post.date[:-6], '%Y-%m-%dT%H:%M:%S') + datetime.timedelta(
            seconds=-time.timezone, hours=1)  # 2014-04-10T15:00:01+00:00
    timediff = now - postDate
    if timediff < datetime.timedelta(days=1):
        # add to outp
        subject = ""
        try:
            subject = "#" + post.subject.replace(' ', ' #')
        except:
            pass
        outp = outp + "|" + (datetime.datetime.strftime(
            postDate, '%H:%M')) + "|" + "[" + post.title.replace(
                '[priv] ', '').replace(
                    '|', '–') + "](" + post.link + ")|" + subject + "|\n"

dayone_entry = outp + dayone_footer
# User confirmation
def Parse(rsslink):
    return feedparser.parse(rsslink)
Example #45
    trainMat = []; trainClasses = []
    for docIndex in trainningSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(trainMat, trainClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    print "the error rate is:" , float(errorCount)/ len(testSet)
    return vocabList, p0V, p1V

import feedparser
ny = feedparser.parse("http://newyork.craigslist.org/res/index.rss")
sf = feedparser.parse("http://sfbay.craigslist.org/apa/index.rss")


def getTopWords(ny,sf):
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)
    topNY=[]; topSF=[]
    for i in range(len(p0V)):
        if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"
    for item in sortedSF:
        print item[0]
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
Example #46
        message('writing to {}'.format(filename))
        with open(filename, 'wb') as outfile:
            outfile.write(data)
        q.task_done()


# Set up some threads to fetch the enclosures.
for i in range(num_fetch_threads):
    worker = threading.Thread(
        target=download_enclosures,
        args=(enclosure_queue, ),
        name='worker-{}'.format(i),
    )
    worker.setDaemon(True)
    worker.start()

# Download the feed(s) and put the enclosure URLs into
# the queue.
for url in feed_urls:
    response = feedparser.parse(url, agent='py_02_queueFetchPodcasts.py')
    for entry in response['entries'][:5]:
        for enclosure in entry.get('enclosures', []):
            parsed_url = urlparse(enclosure['url'])
            message('queuing {}'.format(parsed_url.path.rpartition('/')[-1]))
            enclosure_queue.put(enclosure['url'])

# Now wait for the queue to be empty, indicating that we have
# processed all of the downloads.
message('*** main thread waiting')
enclosure_queue.join()
message('*** done')
Example #47
import feedparser
import csv

purl = "https://gdata.youtube.com/feeds/api/playlists/"

f = open('youtube_failed.csv', 'a')
w = csv.writer(f)
playlistid = raw_input("Enter the playlist id : ")
prss = feedparser.parse(purl + str(playlistid))
for i in range(prss.entries.__len__()):
    try:
        vrss = feedparser.parse(str(prss.entries[i].id))
    except:
        print(prss.entries[i].id, ' not parsed')

    #out = str.join(str(plid,vrss.entries[0].title,vrss.entries[0].description))
    try:
        #print(playlist[plid],vrss.entries[0].title,vrss.entries[0].description)
        w.writerow([
            prss.feed.title.encode('utf-8'),
            vrss.entries[0].title.encode('utf-8'),
            vrss.entries[0].link.encode('utf-8'),
            vrss.entries[0].description.encode('utf-8')
        ])
    except:
        print("Error detected")
f.close()
Example #48
        print e
        print 'starting from an empty database'
        db = {}

    # -----------------------------------------------------------------------------
    # main loop where we fetch the new results
    print 'database has %d entries at start' % (len(db), )
    num_added_total = 0
    for i in range(args.start_index, args.max_index,
                   args.results_per_iteration):

        print "Results %i - %i" % (i, i + args.results_per_iteration)
        query = 'search_query=%s&sortBy=lastUpdatedDate&start=%i&max_results=%i' % (
            args.search_query, i, args.results_per_iteration)
        response = urllib.urlopen(base_url + query).read()
        parse = feedparser.parse(response)
        num_added = 0
        num_skipped = 0
        for e in parse.entries:

            j = encode_feedparser_dict(e)

            # extract just the raw arxiv id and version for this paper
            rawid, version = parse_arxiv_url(j['id'])
            j['_rawid'] = rawid
            j['_version'] = version

            # add to our database if we didn't have it before, or if this is a new version
            if rawid not in db or j['_version'] > db[rawid]['_version']:
                db[rawid] = j
                print 'updated %s added %s' % (j['updated'], j['title'])
Example #49
0
    def __init__(self, ticker):
        self.data = feedparser.parse(
            f'http://articlefeeds.nasdaq.com/nasdaq/symbols?symbol={ticker}')
Example #50
0
    def FetchURL(url):
        feed = feedparser.parse(url)
        feed_items = feed['items']
        return feed_items
Example #51
0
def rss():
    url = request.form['rss_url']
    dictionary = feedparser.parse(url)
    import json
    # default=str keeps json.dumps from failing on values such as the
    # time.struct_time in entry.published_parsed.
    return json.dumps(dictionary.entries, default=str)
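
# Standalone sketch (not part of the original snippet): the imports, app object,
# and route decorator the view above presumes. The endpoint path, the POST-only
# method, and the rss_view name are assumptions; the body restates the function
# above so the sketch runs on its own.
import json

import feedparser
from flask import Flask, request

app = Flask(__name__)


@app.route('/rss', methods=['POST'])
def rss_view():
    url = request.form['rss_url']
    return json.dumps(feedparser.parse(url).entries, default=str)


if __name__ == '__main__':
    app.run()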
Example #52
0
    def __init__(self, loop_counter):
        # first pass (loop_counter == 0): insert rows
        google_url = 'http://feeds.feedburner.com/GDBcode'
        dena_url = 'https://engineer.dena.com/index.xml'
        merukari_url = 'https://tech.mercari.com/feed'
        sakura_url = 'https://knowledge.sakura.ad.jp/rss/'  # exposes 'updated' instead of 'published'; handled below
        smatrcamp_url = 'https://tech.smartcamp.co.jp/rss'
        salesforce_url = 'https://developer.salesforce.com/jpblogs/feed/'
        asana_url = 'https://blog.asana.com/category/eng/feed/'
        insta_url = 'https://instagram-engineering.com/feed'
        indeed_url = 'https://engineering.indeedblog.com/blog/feed/'
        dropbox_url = 'https://dropbox.tech/feed'
        uber_url = 'https://eng.uber.com/feed/'
        spotify_url = 'https://labs.spotify.com/feed/'

        xml_urls = [
            google_url, dena_url, merukari_url, sakura_url, smatrcamp_url,
            salesforce_url, asana_url, insta_url, indeed_url, dropbox_url,
            uber_url, spotify_url
        ]
        xml_titles = [
            'Google', 'Dena', 'Merukari', 'Sakura', 'Smatrcamp', 'SalesForce',
            'Asana', 'Insta', 'indeed', 'DropBox', 'Uber', 'Spotify'
        ]
        #xml_urls =[google_url,dena_url,merukari_url,sakura_url]

        if loop_counter == 0:
            con = psycopg2.connect(host="localhost",
                                   database="feed",
                                   user="******",
                                   password="******")
            cur = con.cursor()

            item_id = 0
            for x, t in zip(xml_urls, xml_titles):
                loop_count = 0
                while loop_count < 3:
                    feed_url = feedparser.parse(x)
                    media_title = t
                    print(media_title)
                    post_title = feed_url['entries'][loop_count]['title']
                    post_url = feed_url['entries'][loop_count]['link']
                    if 'published' in feed_url['entries'][loop_count].keys():
                        post_date = feed_url['entries'][loop_count][
                            'published']
                    elif 'updated' in feed_url['entries'][loop_count].keys():
                        post_date = feed_url['entries'][loop_count]['updated']

                    postgres_insert_query = """ INSERT INTO feed (id, media, title, url, date) VALUES (%s,%s,%s,%s,%s)"""
                    record_to_insert = (item_id, media_title, post_title,
                                        post_url, post_date)
                    cur.execute(postgres_insert_query, record_to_insert)
                    con.commit()

                    loop_count += 1
                    item_id += 1
            # end of XML crawl

            cur.execute("select id,media,title,url,date from feed")
            for r in cur:
                print(r)

        else:
            # second pass: update the existing rows
            con = psycopg2.connect(host="localhost",
                                   database="feed",
                                   user="******",
                                   password="******")
            cur = con.cursor()

            item_id = 0
            for x, t in zip(xml_urls, xml_titles):
                loop_count = 0
                while loop_count < 3:
                    feed_url = feedparser.parse(x)
                    media_title = t
                    print(media_title)
                    post_title = feed_url['entries'][loop_count]['title']
                    post_url = feed_url['entries'][loop_count]['link']
                    if 'published' in feed_url['entries'][loop_count].keys():
                        post_date = feed_url['entries'][loop_count][
                            'published']
                    elif 'updated' in feed_url['entries'][loop_count].keys():
                        post_date = feed_url['entries'][loop_count]['updated']

                    sql_update_query = """Update feed set title=%s, url=%s, date=%s where id=%s"""
                    cur.execute(
                        sql_update_query,
                        (post_title, post_url, post_date, str(item_id)))
                    con.commit()

                    loop_count += 1
                    item_id += 1

            #cur.execute("select id,media,title,url,date from feed")
            #for r in cur:
            #    print(r)

        cur.close()
        con.close()
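
# Refactoring sketch (not part of the original class): the insert and update
# branches above crawl the feeds identically, so the crawl itself could live in
# one generator. The three-posts-per-feed limit and the published/updated
# fallback are taken from the code above; everything else is an assumption.
import feedparser


def crawl_feeds(xml_urls, xml_titles, posts_per_feed=3):
    """Yield (item_id, media, title, url, date) rows for the newest posts."""
    item_id = 0
    for feed_url, media_title in zip(xml_urls, xml_titles):
        parsed = feedparser.parse(feed_url)
        for entry in parsed['entries'][:posts_per_feed]:
            # Some feeds expose 'updated' instead of 'published'.
            post_date = entry.get('published', entry.get('updated', ''))
            yield (item_id, media_title, entry['title'], entry['link'], post_date)
            item_id += 1


# e.g. the insert branch then collapses to:
#     for row in crawl_feeds(xml_urls, xml_titles):
#         cur.execute(postgres_insert_query, row)
#     con.commit()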
Example #53
0
    link_container = soup.findAll("div", {"class": "blog-details clear"})
    #print('linkcontainer ====>', link_container)
    links = list(
        map(lambda row: row.find_all("pre", class_="links"), link_container))
    flat_list = [item for sublist in links for item in sublist]
    #print('flatlist ========>', flat_list)
    filteredLinks = list(
        filter(lambda entry: hosterName in (entry.string or ''), flat_list))
    if filteredLinks:
        return filteredLinks[0]
    return None


if __name__ == '__main__':
    d = feedparser.parse('http://rmz.cr/feed')
    download_folder = config.get('downloadfolder')
    quality = config.get('quality')
    shows = config.get('shows')
    hosterShort = config.get('hosterShort')
    hosterName = config.get('hosterName')
    # Iterate through the entries and fetch the title and link, which is the relevant data
    print('###################start################### ' +
          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    prefiltered_values = list(
        filter(lambda x: hosterShort in x.title and quality in x.title,
               d['entries']))

    for entry in prefiltered_values:
        raw_title = entry['title']
Example #54
0
                [["http://rss.lemonde.fr/c/205/f/3054/index.rss"],["le monde"]],
                [["http://fr.news.yahoo.com/rss/france"],["yahoo"]],
                [["http://fr.news.yahoo.com/rss/societe"],["yahoo"]],
                [["http://www.tf1.fr/xml/rss/0,,1,00.xml"],["tf1"]],
                [["http://www.20min.ch/rss/rss.tmpl?type=channel&get=6"],["20minutes"]],
                [["http://www.20min.ch/rss/rss.tmpl?type=channel&get=17"],["20minutes"]],
                [["http://www.francetvinfo.fr/titres.rss"],["francetv"]]]

feeds = []
listemotcle=[]
n=0
for element,journal in listeurl:
    listemotcle.append([])
    MyFeedsConfig = element
    # Parse only this journal's feeds; iterating the cumulative `feeds` list
    # would re-count every feed from the previous journals as well.
    current_feeds = [feedparser.parse(feed) for feed in MyFeedsConfig]
    feeds.extend(current_feeds)
    for feed in current_feeds:
        for item in feed["items"]:
            phrase=item.title
            listemots = phrase.split()
            for elt in listemots:
                elt=elt.lower()
                elt=Enleve_Accents(elt)
                try:
                    if elt[-1]=="s":
                        elt=elt[0:-1]
                    if elt[-1]==","or elt[-1]==".":
                        elt=elt[0:-1]
                    if "'" in elt:
                        elt=elt[2:]
                    if elt[-1]=='"':
Example #55
0
#!/usr/bin/python
# encoding: utf-8
import hashlib, feedparser, requests, httplib, urllib

feed = feedparser.parse('http://www.addic7ed.com/rss.php?mode=hotspot')


# send a Pushover notification
def pushover(msg):
    conn = httplib.HTTPSConnection("api.pushover.net:443")
    conn.request(
        "POST", "/1/messages.json",
        urllib.urlencode({
            "token": "yourtoken",
            "user": "******",
            "message": msg,
        }), {"Content-type": "application/x-www-form-urlencoded"})
    conn.getresponse()
    return


def pushover2(msg):
    conn = httplib.HTTPSConnection("api.pushover.net:443")
    conn.request(
        "POST", "/1/messages.json",
        urllib.urlencode({
            "token": "yourtoken",
            "user": "******",
            "message": msg,
        }), {"Content-type": "application/x-www-form-urlencoded"})
    conn.getresponse()
Example #56
0
import sys
from configparser import ConfigParser

import feedparser

import db_operations
import read_content

config_file_path = 'datafeeder/conn.config'
section_name = 'rss links'
#prepare database
db = db_operations.db_operations()
if not db.connect():
    exit(2)
print('rss read and database connected')
#read rss links from config file
config_parser = ConfigParser()
config_parser.read(config_file_path)
if config_parser.has_section(section_name):
    links = config_parser.items(section_name)
else:
    print('section "%s" not found in %s' % (section_name, config_file_path))
    exit(2)

for rss_link in links[0][1].split(','):
    feed = feedparser.parse(rss_link)
    if not feed.entries:
        continue
    for post in feed.entries:
        title = post.title
        link = post.link
        if not db.found_duplicate(link, title):
            db.insert(rss_link, title, link)

read_content.read_content()                
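
# For reference, a hypothetical conn.config matching what the code above reads
# (one option under an "rss links" section whose value is a comma-separated
# list of feed URLs; the option name and URLs are placeholders):
#
#     [rss links]
#     feeds = https://example.com/rss.xml,https://example.org/feed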
Example #57
0
def getForecast(profile):
    return feedparser.parse("http://rss.wunderground.com/auto/rss_full/" +
                            str(profile['location']))['entries']
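
# Usage sketch (not part of the original snippet): the profile argument only
# needs a 'location' value that Weather Underground's RSS endpoint understood
# (e.g. a US ZIP code); the value below is an assumption.
if __name__ == '__main__':
    for entry in getForecast({'location': '94101'}):
        print(entry.title)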
Example #58
0
import feedparser
import re
import sys
import os

# ../mylib.py
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from mylib import print_console


def strip(html):
    return re.sub('<[^<]+?>', '', html)


url = "http://www.priberam.pt/dlpo/DoDiaRSS.aspx"

try:
    f = feedparser.parse(url)
    wotd_l = strip(f["items"][0]["summary"]).split("\n")
except Exception:
    print_console("Error parsing results.")
    exit(-1)

s = "\002%s\002" % wotd_l[0]
for l in wotd_l[1:]:
    if len(l) > 1:
        l = l.strip()
        s += "%s\002;\002 " % (l)

print_console(s)
Example #59
0
def get_urls():
    try:
        urls = [] 
        
        hit_list = [
            'http://braziljournal.com/rss',
            'http://jovempan.uol.com.br/feed',
            'http://www.huffpostbrasil.com/rss/index.xml',
            'http://www.jb.com.br/noticias/rss.xml',
            'http://www.jornaisvirtuais.com.br/feed/',
            'http://aosfatos.org/noticias/feed/',
            'https://apublica.org/feed/',
            'http://br.sputniknews.com/export/rss2/archive/index.xml',
            'https://catracalivre.com.br/feed/',
            'https://www.metropoles.com/feed',
            'http://www.opopular.com.br/cmlink/o-popular-%C3%BAltimas-1.272904',
            'http://altamiroborges.blogspot.com/feeds/posts/default',
            'http://avozeavezdajuventude.blogspot.com/feeds/posts/default',
            'http://blogdoprimo.com.br/feed/',
            'http://blogdoriella.com.br/feed/',
            'http://blogdoskarlack.com/feed/',
            'http://www.jornalcash.com.br/?feed=rss2',
            'http://blogmarcosfrahm.com/feed/',
            'http://congressoemfoco.uol.com.br/feed/',
            'http://elielbezerra.blogspot.com/feeds/posts/default',
            'http://osamigosdopresidentelula.blogspot.com/feeds/posts/default',
            'http://outraspalavras.net/feed/',
            'http://outroladodanoticia.com.br/feed/',
            'http://polibiobraga.blogspot.com/feeds/posts/default',
            'http://poncheverde.blogspot.com/feeds/posts/default',
            'http://previdi.blogspot.com/feeds/posts/default',
            'http://sembalela.com/feed/',
            'http://www.blogdafloresta.com.br/feed/',
            'http://www.blogdoataide.com.br/feed/',
            'http://www.blogdoluciosorge.com.br/feed/',
            'http://www.diariodocentrodomundo.com.br/feed/',
            'http://www.fabiocampana.com.br/feed/',
            'http://www.imprensaviva.com/feeds/posts/default',
            'http://www.jaderbarbalho.com/v3/index.php/feed/',
            'http://www.ma10.com.br/minard/feed/',
            'https://www.ocafezinho.com/feed/',
            'http://www.ocombatente.com/feed/',
            'http://www.politicanarede.com/feeds/posts/default',
            'http://www.redebrasilatual.com.br/ultimas-noticias/atom.xml',
            'http://www.saibamais.jor.br/feed/',
            'http://www.tijolaco.com.br/blog/feed/',
            'http://www.vermelho.org.br/xml/rss_noticias.xml',
            'https://blogdoneylopes.wordpress.com/feed/',
            'http://br.sputniknews.com/export/rss2/archive/index.xml',
            'https://osdivergentes.com.br/feed/',
            'https://www.balaiodokotscho.com.br/feed/',
            'https://www.brasildefato.com.br/rss2.xml',
            'https://www.ceticismopolitico.org/feed/',
            'https://www.domingoscosta.com.br/feed/',
            'https://www.oantagonista.com/feed/',
            'https://jornalivre.com/feed/',
            'http://marcossilverio.blogspot.com/feeds/posts/default',
            'http://mauriciostycer.blogosfera.uol.com.br/feed/',
            'http://www.otvfoco.com.br/feed/',
            'http://www.telesintese.com.br/feed/',
            'http://www.vcfaz.tv/rssnews.php?f=17',
            'http://feed.observatoriodatelevisao.bol.uol.com.br/feed',
            'http://comunicadores.info/feed/',
            'http://portaldapropaganda.com.br/noticias/feed/',
            'http://www.administradores.com.br/rss/noticias/',
            'http://www.bluebus.com.br/feed/',
            'http://www.inteligemcia.com.br/feed/',
            'http://www.papelpop.com/feed/',
            'http://www.updateordie.com/feed/',
            'http://www.mundodomarketing.com.br/feed/rss.xml',
            'https://www.promoview.com.br/feed.rss',
            'http://feeds.feedburner.com/gospelprime',
            'http://justificando.cartacapital.com.br/feed/',
            'https://www.comunique-se.com.br/feed/',
            'https://www.torcedores.com/comments/feed',
            'http://www.portalmidiaesporte.com/feeds/posts/default',
            'http://www.esporteemidia.com/feeds/posts/default',
            'https://blogdopaulinho.com.br/feed/',
            'http://www.mktesportivo.com/feed/',
            'http://www.mtesporte.com.br/rss.php',
            'http://lulacerda.ig.com.br/feed/',
            'https://www.tecmundo.com.br/busca?q=feed',
            'https://www12.senado.leg.br/noticias/feed/todasnoticias/RSS',
            'https://www.ancine.gov.br/pt-br/rss.xml',
            'https://gife.org.br/feed/',
            'http://www.pt.org.br/feed/',
            'http://servicios.lanacion.com.ar/herramientas/rss/origen=2'
            ]
          
        future_calls = [feedparser.parse(rss_url) for rss_url in hit_list]
        
        entries = []
        for feed in future_calls:
            entries.extend(feed["items"])
        
        for entry in entries:
            urls.append(entry['link'])

        return urls
    except Exception as exc:
        raise Exception('Exception in rss_multiplos') from exc
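
# Sketch (not part of the original function): despite the name `future_calls`,
# the list comprehension above parses every feed sequentially. With this many
# feeds, a thread pool is the usual way to overlap the network waits; the
# worker count is an assumption.
from concurrent.futures import ThreadPoolExecutor

import feedparser


def parse_all(rss_urls, max_workers=10):
    """Parse many RSS URLs concurrently and return the parsed feeds."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(feedparser.parse, rss_urls))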
Example #60
0
    def parse(self):

        # fetch etag and last modified
        head_response = requests.head(
            self.podcast.rss, headers=get_headers(), timeout=5
        )
        head_response.raise_for_status()
        headers = head_response.headers

        # if etag hasn't changed then we can skip
        etag = headers.get("ETag")
        if etag and etag == self.podcast.etag:
            return []

        response = requests.get(
            self.podcast.rss, headers=get_headers(), stream=True, timeout=5
        )
        response.raise_for_status()

        data = feedparser.parse(response.content)

        feed = data["feed"]

        entries = {e["id"]: e for e in data.get("entries", []) if "id" in e}.values()
        if not entries:
            return []

        dates = [d for d in [parse_date(e.get("published")) for e in entries] if d]

        now = timezone.now()

        # Guard against feeds with no usable dates (or only future-dated
        # entries); otherwise pub_date would be unbound below.
        past_dates = [date for date in dates if date < now]
        pub_date = max(past_dates) if past_dates else None

        do_update = pub_date is not None and (
            self.podcast.last_updated is None
            or self.podcast.last_updated < pub_date
        )

        if not do_update:
            return []

        if etag:
            self.podcast.etag = etag

        self.podcast.title = feed["title"]
        self.podcast.description = feed["description"]
        self.podcast.language = feed.get("language", "en")[:2].strip().lower()
        self.podcast.explicit = bool(feed.get("itunes_explicit", False))

        if not self.podcast.cover_image:
            image_url = None

            # try itunes image first
            soup = BeautifulSoup(response.content, "lxml")
            itunes_img_tag = soup.find("itunes:image")
            if itunes_img_tag and "href" in itunes_img_tag.attrs:
                image_url = itunes_img_tag.attrs["href"]

            if not image_url:
                try:
                    image_url = feed["image"]["href"]
                except KeyError:
                    pass

            try:
                if image_url and (img := fetch_image_from_url(image_url)):
                    self.podcast.cover_image = img
            except InvalidImageURL:
                pass

        self.podcast.link = feed.get("link")

        categories_dct = get_categories_dict()

        keywords = [t["term"] for t in feed.get("tags", [])]
        categories = [categories_dct[kw] for kw in keywords if kw in categories_dct]

        self.podcast.last_updated = now
        self.podcast.pub_date = pub_date

        keywords = [kw for kw in keywords if kw not in categories_dct]
        self.podcast.keywords = " ".join(keywords)

        authors = set(
            [
                author["name"]
                for author in feed.get("authors", [])
                if "name" in author and author["name"]
            ]
        )

        self.podcast.authors = ", ".join(authors)
        self.podcast.extracted_text = self.extract_text(categories, entries)
        self.podcast.save()

        self.podcast.categories.set(categories)

        new_episodes = self.create_episodes_from_feed(entries)
        if new_episodes:
            self.podcast.pub_date = max(e.pub_date for e in new_episodes)
            self.podcast.save(update_fields=["pub_date"])

        return new_episodes
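
# The parser above leans on project helpers that are not shown here
# (get_headers, parse_date, fetch_image_from_url, get_categories_dict, and the
# episode helpers). Hypothetical minimal stand-ins for the two simplest ones,
# purely to make the snippet easier to follow:
import datetime
from email.utils import parsedate_to_datetime

from django.utils import timezone


def get_headers():
    """Hypothetical stand-in: identify the crawler to the feed host."""
    return {"User-Agent": "podcast-parser/0.1"}


def parse_date(value):
    """Hypothetical stand-in: RFC 2822 date string -> aware datetime, or None."""
    if not value:
        return None
    try:
        dt = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return None
    if timezone.is_aware(dt):
        return dt
    return timezone.make_aware(dt, datetime.timezone.utc)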