def post(self):
    """Task-queue handler: fetch one RSS feed and store its entries.

    Expects a ``key`` POST parameter holding the NewsFeed key name.
    On success, stores/overwrites NewsItem entities (keyed by entry guid,
    so repeat fetches update rather than duplicate), stamps the feed's
    ``last_fetch``, and re-enqueues itself for the next pull.
    Responds 404 for an unknown feed key and 500 on a non-200 fetch.
    """
    key = self.request.get('key')
    feed = NewsFeed.get_by_key_name(key)
    # FIX: previously unchecked (see old FIXME) -- get_by_key_name returns
    # None for an unknown key and the code below would raise AttributeError.
    if feed is None:
        self.error(404)
        return
    result = urlfetch.fetch(feed.url)
    if result.status_code == 200:
        rssfeed = feedparser.parse(result.content)
        for i in rssfeed.entries:
            # key_name=guid makes the write idempotent per entry.
            item = NewsItem(key_name=i.guid)
            item.url = i.link
            item.title = i.title
            item.text = i.summary
            item.date = datetime.datetime(*i.date_parsed[:6])
            item.orderdate = datetime.datetime(*i.date_parsed[:6])
            item.source = feed
            item.put()
        feed.last_fetch = datetime.datetime.now()
        feed.put()
        # FIX: ``feed.key`` (no parens) is the bound method object, not the
        # key -- the re-enqueued task's get_by_key_name() lookup could never
        # succeed. Pass the actual key name instead.
        taskqueue.add(queue_name='fetch-news-queue',
                      url='/admin/feeds/fetch/',
                      params={'key': feed.key().name()})
        self.response.out.write('feed pulled')
    else:
        self.error(500)
# --- Fragment of a Python 2 RSS-ingest loop (enclosing ``for`` over feed
# entries is outside this view; ``entry``, ``url_hash`` and ``link`` are
# loop variables defined above it). Dates default to "today" at US/Eastern
# midnight. NOTE: bare ``.timestamp`` (no parens) implies arrow < 1.0,
# where it is a property -- TODO confirm pinned arrow version.
published_date = arrow.now().to('US/Eastern').floor('day').format(
    'YYYY-MM-DD')
published_ts = arrow.now().to('US/Eastern').floor('day').timestamp
# See if we already have this story
try:
    # Dedup on the URL hash: ``continue`` skips to the next feed entry.
    NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
# Only reached on the DoesNotExist path (the try path ``continue``s).
headline = entry.title
summary = entry.summary
# Strip out any inline html
summary_soup = bs4.BeautifulSoup(summary, 'html.parser')
summary = summary_soup.text
item.url_hash = url_hash
item.link = link
item.source = 'NY Times'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of a Weather Underground scraper loop. The first assignment
# appears to be the body of an ``if meta_og_title_el is not None:`` whose
# header is above this view -- TODO confirm against the full file.
headline = meta_og_title_el['content'].strip()
if meta_og_desc_el is not None:
    summary = meta_og_desc_el['content'].strip()
if meta_og_url_el is not None:
    link = meta_og_url_el['content']
# Strip the site suffix the og:title carries.
if headline.endswith(' - wunderground.com'):
    headline = headline.replace(' - wunderground.com', '')
item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = summary
item.source = 'Weather Underground'
# The author and date are in the same text string
parts = tr_el.em.text.strip().split('\n\t\t')
if len(parts) == 1:
    # Only a date present: normalise to Eastern midnight, then to UTC.
    # NOTE(review): the -05:00 suffix hard-codes EST and will be off by an
    # hour during daylight saving -- confirm whether that is acceptable.
    dt = dateutil.parser.parse(parts[0])
    dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime('%Y-%m-%d') + 'T00:00:00-05:00'
    timestamp = arrow.get(dt).to('UTC').timestamp
    published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
    item.published_ts = timestamp
else:
    # Author and date present: the date is the second part.
    dt = dateutil.parser.parse(parts[1])
    dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime('%Y-%m-%d') + 'T00:00:00-05:00'
    timestamp = arrow.get(dt).to('UTC').timestamp
    published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
    # NOTE: item.published_ts is not assigned in this branch within the
    # visible fragment -- presumably set just below; verify in full file.
# --- Fragment of a Capital Weather Gang ingest loop (``entry``,
# ``url_hash``, ``link``, ``published_date``, ``published_ts`` and
# ``prefix_match`` come from the unseen enclosing loop).
# Skip stories whose headline starts with any configured prefix.
for prefix in skippable_headline_prefixes:
    if entry.title.startswith(prefix):
        prefix_match = True
if prefix_match:
    print 'Skipping story'
    continue
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash==url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
# Only reached for new items; strip inline HTML and ASCII-fold the text.
soup = bs4.BeautifulSoup(entry.description, 'html.parser')
item.summary = unidecode.unidecode(soup.text.strip())
item.title = unidecode.unidecode(entry.title)
item.url_hash = url_hash
item.link = link
item.authors = ''
item.source = 'Capital WX Gang'
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of a Weather 5280 ingest loop (``date``, ``entry``,
# ``url_hash``, ``link`` defined above this view).
# Display date in Eastern; stored timestamp in UTC.
published_date = arrow.get(date).to('US/Eastern').date().strftime(
    '%Y-%m-%d')
published_ts = arrow.get(date).to('UTC').timestamp
headline = entry.title
summary = entry.summary
# Tag the recurring column with its market so it reads right in a
# mixed-source feed.
if headline.startswith('The State of the Atmosphere'):
    headline = "(Denver) %s" % headline
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
item.url_hash = url_hash
item.link = link
item.source = 'Weather 5280'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of a Seattle Times ingest loop. Dedup key is the MD5 of the
# link URL (Python 2: md5() accepts a str directly).
link = entry.link
url_hash = hashlib.md5(link).hexdigest()
date = entry.published_parsed
# Display date in Pacific; stored timestamp converted on to UTC.
published_date = arrow.get(date).to('US/Pacific').date().strftime('%Y-%m-%d')
published_ts = arrow.get(date).to('US/Pacific').to('UTC').timestamp
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash==url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
headline = entry.title
summary = entry.summary
item.url_hash = url_hash
item.link = link
item.source = 'Seattle Times'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of a San Diego Union Tribune scraper loop. Open Graph meta
# elements were extracted above this view; each may be None.
if meta_og_title_el is not None:
    headline = meta_og_title_el['content'].strip()
if meta_og_desc_el is not None:
    summary = meta_og_desc_el['content'].strip()
if meta_og_url_el is not None:
    link = meta_og_url_el['content']
if meta_published_el is not None:
    published_datetime = meta_published_el['content']
    dt = arrow.get(published_datetime).datetime
    # NOTE(review): datetime.replace(tzinfo=pytz.timezone(...)) attaches
    # the zone's LMT offset rather than the correct PST/PDT offset;
    # pytz recommends tz.localize(dt) -- confirm intent.
    dt = dt.replace(tzinfo=pytz.timezone('US/Pacific'))
    published_ts = arrow.get(dt).to('UTC').timestamp
    published_date = arrow.get(dt).datetime.strftime('%Y-%m-%d')
item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = summary
item.source = 'San Diego Union Tribune'
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# Sleep between requests
time.sleep(1)
# --- Fragment of a Science Daily ingest loop (``date``, ``entry``,
# ``url_hash``, ``link`` come from the unseen enclosing loop).
published_date = arrow.get(date).to('US/Eastern').date().strftime('%Y-%m-%d')
published_ts = arrow.get(date).to('US/Eastern').to('UTC').timestamp
# Make sure the date isn't greater than today (it happens)
if published_date > arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD'):
    # Clamp future-dated stories to today (Eastern midnight).
    published_date = arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD')
    published_ts = arrow.now().to('US/Eastern').floor('day').timestamp
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash==url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
headline = entry.title
summary = entry.summary
item.url_hash = url_hash
item.link = link
item.source = 'Science Daily'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of a NY Post scraper loop. The leading ``= None`` resets
# look like the fallback body of an ``except`` whose header is above this
# view (same shape as the Accuweather/Star Tribune scrapers) -- as written
# standalone they would make every ``is not None`` branch below dead code;
# verify against the full file.
meta_og_title_el = None
meta_og_desc_el = None
meta_og_url_el = None
meta_published_el = None
if meta_og_title_el is not None:
    headline = meta_og_title_el['content'].strip()
if meta_og_desc_el is not None:
    summary = meta_og_desc_el['content'].strip()
if meta_og_url_el is not None:
    link = meta_og_url_el['content']
if meta_published_el is not None:
    # Source timestamps are treated as US/Eastern, stored as UTC.
    published_datetime = meta_published_el['content']
    dt = arrow.get(published_datetime).to('US/Eastern')
    published_ts = dt.to('UTC').timestamp
    published_date = dt.date().strftime('%Y-%m-%d')
item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = summary
item.source = 'NY Post'
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of an Accuweather scraper loop. The opening ``try:`` for
# the except below is above this view; the lookups are its body.
    # Pull Open Graph metadata from the fetched article page.
    meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
    meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
    meta_og_url_el = links_soup.find('meta', {'property': 'og:url'})
except Exception, e:
    # Page failed to parse -- fall back to whatever defaults the loop set.
    meta_og_title_el = None
    meta_og_desc_el = None
    meta_og_url_el = None
if meta_og_title_el is not None:
    headline = meta_og_title_el['content'].strip()
if meta_og_desc_el is not None:
    description = meta_og_desc_el['content'].strip()
if meta_og_url_el is not None:
    link = meta_og_url_el['content']
item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = description
item.source = "Accuweather"
item.published_date = published_date
item.published_ts = utc_dt.timestamp
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# Sleep between requests
time.sleep(1)
# See if any of the skippable ids are in the story ids if pcollid in skippable_collection_ids: print 'Skipping %s story' % pcollid continue # If it's also published on weather underground, skip it if 'wunderground' in tags: print 'Skipping Weather Underground Story' continue # See if the story already exists try: item = NewsItem.get(NewsItem.url_hash == url_hash) print 'Item Exists. Skipping.' continue except peewee.DoesNotExist: print 'Creating new item.' item = NewsItem() item.url_hash = url_hash item.title = unidecode.unidecode(entry['title'].strip()) item.summary = unidecode.unidecode(entry['description'].strip()) item.source = "Weather Channel" item.link = link item.published_date = published_date item.published_ts = published_ts item.inserted_ts = arrow.utcnow().timestamp item.save()
# --- Fragment of a Capital Weather Gang ingest loop (near-duplicate of an
# earlier chunk in this file; ``prefix_match`` initialised by the loop).
for prefix in skippable_headline_prefixes:
    if entry.title.startswith(prefix):
        prefix_match = True
if prefix_match:
    print 'Skipping story'
    continue
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
# Strip inline HTML and ASCII-fold the text before storing.
soup = bs4.BeautifulSoup(entry.description, 'html.parser')
item.summary = unidecode.unidecode(soup.text.strip())
item.title = unidecode.unidecode(entry.title)
item.url_hash = url_hash
item.link = link
item.authors = ''
item.source = 'Capital WX Gang'
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# Make sure the date isn't greater than today (it happens) if published_date > arrow.now().to('US/Eastern').floor('day').format( 'YYYY-MM-DD'): published_date = arrow.now().to('US/Eastern').floor('day').format( 'YYYY-MM-DD') published_ts = arrow.now().to('US/Eastern').floor('day').timestamp # See if we already have this story try: NewsItem.get(NewsItem.url_hash == url_hash) print 'Item exists. Skipping.' continue except peewee.DoesNotExist: print 'Creating item.' item = NewsItem() headline = entry.title summary = entry.summary item.url_hash = url_hash item.link = link item.source = 'Science Daily' item.title = headline item.summary = summary item.published_date = published_date item.published_ts = published_ts item.inserted_ts = arrow.utcnow().timestamp item.save()
url_hash = hashlib.md5(link).hexdigest() # See if the item already exists try: item = NewsItem.get(NewsItem.url_hash==url_hash) print 'Item Exists. Skipping.' continue except peewee.DoesNotExist: print 'Creating new item.' item = NewsItem() date_string = story_dict['PublishDate'] date_utc = arrow.get(date_string).to('UTC') date_eastern = date_utc.to('US/Eastern') published_date = date_eastern.format('YYYY-MM-DD') published_ts = date_utc.timestamp headline = story_dict['Title'] summary = story_dict['Summary'] item.link = link item.url_hash = url_hash item.title = headline item.summary = summary item.source = 'WeatherBug' item.published_date = published_date item.published_ts = published_ts item.inserted_ts = arrow.utcnow().timestamp item.save()
# --- Fragment of a Star Tribune scraper loop. The opening ``try:`` for
# the except below is above this view; the lookups are its body.
    # Pull Open Graph metadata from the fetched article page.
    meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
    meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
    meta_og_url_el = links_soup.find('meta', {'property': 'og:url'})
except Exception, e:
    # Page failed to parse -- fall back to whatever defaults the loop set.
    meta_og_title_el = None
    meta_og_desc_el = None
    meta_og_url_el = None
if meta_og_title_el is not None:
    headline = meta_og_title_el['content'].strip()
if meta_og_desc_el is not None:
    summary = meta_og_desc_el['content'].strip()
if meta_og_url_el is not None:
    link = meta_og_url_el['content']
item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = summary
item.source = 'Star Tribune'
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# Sleep between requests to be polite
time.sleep(1)
# --- Fragment of a Weather Underground scraper loop (near-duplicate of an
# earlier chunk; the first assignment appears to be the body of an
# ``if meta_og_title_el is not None:`` above this view).
headline = meta_og_title_el['content'].strip()
if meta_og_desc_el is not None:
    summary = meta_og_desc_el['content'].strip()
if meta_og_url_el is not None:
    link = meta_og_url_el['content']
# Strip the site suffix the og:title carries.
if headline.endswith(' - wunderground.com'):
    headline = headline.replace(' - wunderground.com', '')
item.link = link
item.url_hash = url_hash
item.title = headline
item.summary = summary
item.source = 'Weather Underground'
# The author and date are in the same text string
parts = tr_el.em.text.strip().split('\n\t\t')
if len(parts) == 1:
    # Only a date present: normalise to Eastern midnight.
    # NOTE(review): '-05:00' hard-codes EST; off by an hour during DST.
    dt = dateutil.parser.parse(parts[0])
    dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime(
        '%Y-%m-%d') + 'T00:00:00-05:00'
    timestamp = arrow.get(dt).to('UTC').timestamp
    published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
    item.published_ts = timestamp
else:
    # Author and date present: the date is the second part.
    # (Fragment ends mid-branch; the rest continues past this view.)
    dt = dateutil.parser.parse(parts[1])
    dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime(
        '%Y-%m-%d') + 'T00:00:00-05:00'
# --- Fragment of a NY Times ingest loop (``published_date``/``published_ts``
# computed above this view; ``entry``, ``url_hash``, ``link`` come from the
# unseen enclosing loop).
# Clamp future-dated stories to today (Eastern midnight).
if published_date > arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD'):
    published_date = arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD')
    published_ts = arrow.now().to('US/Eastern').floor('day').timestamp
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash==url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
headline = entry.title
summary = entry.summary
# Strip out any inline html
summary_soup = bs4.BeautifulSoup(summary, 'html.parser')
summary = summary_soup.text
item.url_hash = url_hash
item.link = link
item.source = 'NY Times'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# See if any of the skippable ids are in the story ids if pcollid in skippable_collection_ids: print 'Skipping %s story' % pcollid continue # If it's also published on weather underground, skip it if 'wunderground' in tags: print 'Skipping Weather Underground Story' continue # See if the story already exists try: item = NewsItem.get(NewsItem.url_hash==url_hash) print 'Item Exists. Skipping.' continue except peewee.DoesNotExist: print 'Creating new item.' item = NewsItem() item.url_hash = url_hash item.title = unidecode.unidecode(entry['title'].strip()) item.summary = unidecode.unidecode(entry['description'].strip()) item.source = "Weather Channel" item.link = link item.published_date = published_date item.published_ts = published_ts item.inserted_ts = arrow.utcnow().timestamp item.save()
url_hash = hashlib.md5(link).hexdigest() # See if the item already exists try: item = NewsItem.get(NewsItem.url_hash == url_hash) print 'Item Exists. Skipping.' continue except peewee.DoesNotExist: print 'Creating new item.' item = NewsItem() date_string = story_dict['PublishDate'] date_utc = arrow.get(date_string).to('UTC') date_eastern = date_utc.to('US/Eastern') published_date = date_eastern.format('YYYY-MM-DD') published_ts = date_utc.timestamp headline = story_dict['Title'] summary = story_dict['Summary'] item.link = link item.url_hash = url_hash item.title = headline item.summary = summary item.source = 'WeatherBug' item.published_date = published_date item.published_ts = published_ts item.inserted_ts = arrow.utcnow().timestamp item.save()
# The date string says GMT, but doesn't agree with the date provided. # We need to figure out the current offset and apply that first. date_string = story_dict['Timestamp']['GMT'] offset = datetime.datetime.now(pytz.timezone('US/Eastern')).strftime('%z') date_eastern = arrow.get(date_string + offset).to('US/Eastern') date_utc = date_eastern.to('UTC') published_date = date_eastern.format('YYYY-MM-DD') published_ts = date_utc.timestamp # If the datetime indicates midnight, default it to the current time, at # least it will be closer to accurate since we check every 30 minutes if date_eastern.hour == 0 and date_eastern.minute == 0: print 'Overriding timestamp' date_utc = arrow.utcnow() published_ts = date_utc.timestamp headline = story_dict['Title']['Text'][0] summary = story_dict['IntroText']['Text'][0] item.link = link item.url_hash = url_hash item.title = headline item.summary = summary item.source = 'The Weather Network' item.published_date = published_date item.published_ts = published_ts item.inserted_ts = arrow.utcnow().timestamp item.save()
# --- Fragment of a Weather 5280 ingest loop (near-duplicate of an earlier
# chunk; ``entry``, ``url_hash``, ``link`` come from the unseen loop).
date = entry.published_parsed
# Display date in Eastern; stored timestamp in UTC.
published_date = arrow.get(date).to('US/Eastern').date().strftime('%Y-%m-%d')
published_ts = arrow.get(date).to('UTC').timestamp
headline = entry.title
summary = entry.summary
# Tag the recurring column with its market.
if headline.startswith('The State of the Atmosphere'):
    headline = "(Denver) %s" % headline
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash==url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
item.url_hash = url_hash
item.link = link
item.source = 'Weather 5280'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()
# --- Fragment of a Seattle Times ingest loop (near-duplicate of an
# earlier chunk). Dedup key: MD5 of the link URL (Python 2 str input).
link = entry.link
url_hash = hashlib.md5(link).hexdigest()
date = entry.published_parsed
# Display date in Pacific; stored timestamp converted on to UTC.
published_date = arrow.get(date).to('US/Pacific').date().strftime(
    '%Y-%m-%d')
published_ts = arrow.get(date).to('US/Pacific').to('UTC').timestamp
# See if we already have this story
try:
    NewsItem.get(NewsItem.url_hash == url_hash)
    print 'Item exists. Skipping.'
    continue
except peewee.DoesNotExist:
    print 'Creating item.'
    item = NewsItem()
headline = entry.title
summary = entry.summary
item.url_hash = url_hash
item.link = link
item.source = 'Seattle Times'
item.title = headline
item.summary = summary
item.published_date = published_date
item.published_ts = published_ts
item.inserted_ts = arrow.utcnow().timestamp
item.save()