Example #1
 def post(self):
     key = self.request.get('key')
     feed = NewsFeed.get_by_key_name(key)
     # Make sure the feed was actually retrieved before fetching it
     if feed is None:
         self.error(404)
         return
     result = urlfetch.fetch(feed.url)
     if result.status_code == 200:
         rssfeed = feedparser.parse(result.content)
         for i in rssfeed.entries:
             item = NewsItem(key_name=i.guid)
             item.url = i.link
             item.title = i.title
             item.text = i.summary
             item.date = datetime.datetime(*i.date_parsed[:6])
             item.orderdate = datetime.datetime(*i.date_parsed[:6])
             item.source = feed
             item.put()
         feed.last_fetch = datetime.datetime.now() 
         feed.put() 
         # Re-enqueue using the feed's key_name so the next run can look it up
         taskqueue.add(queue_name='fetch-news-queue', url='/admin/feeds/fetch/',
                       params={'key': feed.key().name()})
         self.response.out.write('feed pulled')
     else:  
         self.error(500)
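
A note on the snippet above: because each NewsItem is constructed with key_name=i.guid, re-fetching the feed overwrites existing items in place instead of duplicating them. If existing items should be left untouched instead, the old App Engine db API's get_or_insert fits; a minimal sketch reusing the fields from the loop above:

    # A minimal sketch: get_or_insert creates the entity only when no
    # NewsItem with this key_name exists yet, so re-fetches become
    # insert-only rather than overwriting.
    item = NewsItem.get_or_insert(
        i.guid,
        url=i.link,
        title=i.title,
        text=i.summary,
        date=datetime.datetime(*i.date_parsed[:6]),
        orderdate=datetime.datetime(*i.date_parsed[:6]),
        source=feed)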
Example #2
    # Make sure the date isn't greater than today (it happens)
    if published_date > arrow.now().to('US/Eastern').floor('day').format(
            'YYYY-MM-DD'):
        published_date = arrow.now().to('US/Eastern').floor('day').format(
            'YYYY-MM-DD')
        published_ts = arrow.now().to('US/Eastern').floor('day').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    # Strip out any inline html
    summary_soup = bs4.BeautifulSoup(summary, 'html.parser')
    summary = summary_soup.text

    item.url_hash = url_hash
    item.link = link
    item.source = 'NY Times'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
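
The try/except peewee.DoesNotExist dance above recurs in nearly every scraper in this collection. When only a skip decision is needed, a hedged alternative is an existence check, which avoids materializing the matching row; a minimal sketch assuming the same NewsItem model and url_hash:

    # A minimal sketch: exists() checks for a matching row without
    # loading the full NewsItem.
    if NewsItem.select().where(NewsItem.url_hash == url_hash).exists():
        print 'Item exists. Skipping.'
        continue
    item = NewsItem()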
    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()

    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    if headline.endswith(' - wunderground.com'):
        headline = headline.replace(' - wunderground.com', '')

    item.link = link 
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'Weather Underground'
    
    # The author and date are in the same text string
    parts = tr_el.em.text.strip().split('\n\t\t')
    
    if len(parts) == 1:
        dt = dateutil.parser.parse(parts[0])
        dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime('%Y-%m-%d') + 'T00:00:00-05:00'
        timestamp = arrow.get(dt).to('UTC').timestamp
        published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
        item.published_ts = timestamp
    else:
        dt = dateutil.parser.parse(parts[1])
        dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime('%Y-%m-%d') + 'T00:00:00-05:00'
        timestamp = arrow.get(dt).to('UTC').timestamp
        published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
        item.published_ts = timestamp
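
A side note on the tzinfo handling above: attaching a pytz zone with datetime.replace() is a known pitfall, because pytz zones default to a historical local-mean-time offset until localized. It happens to be harmless here, since only the date portion of dt survives, but the conventional form is localize(); a minimal sketch reusing parts from above:

    # A minimal sketch: localize() picks the correct UTC offset for the
    # given date, which replace(tzinfo=...) does not.
    eastern = pytz.timezone('US/Eastern')
    dt = eastern.localize(dateutil.parser.parse(parts[0]))
    timestamp = arrow.get(dt).to('UTC').timestamp
    published_date = dt.strftime('%Y-%m-%d')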
    prefix_match = False
    for prefix in skippable_headline_prefixes:
        if entry.title.startswith(prefix):
            prefix_match = True
    
    if prefix_match:
        print 'Skipping story'
        continue
    
    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    soup = bs4.BeautifulSoup(entry.description, 'html.parser')
    item.summary = unidecode.unidecode(soup.text.strip())
    item.title = unidecode.unidecode(entry.title)

    item.url_hash = url_hash
    item.link = link
    item.authors = ''
    item.source = 'Capital WX Gang'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
Example #5
    published_date = arrow.get(date).to('US/Eastern').date().strftime(
        '%Y-%m-%d')
    published_ts = arrow.get(date).to('UTC').timestamp

    headline = entry.title
    summary = entry.summary

    if headline.startswith('The State of the Atmosphere'):
        headline = "(Denver) %s" % headline

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    item.url_hash = url_hash
    item.link = link
    item.source = 'Weather 5280'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    
    link = entry.link
    url_hash = hashlib.md5(link).hexdigest()
    date = entry.published_parsed

    published_date = arrow.get(date).to('US/Pacific').date().strftime('%Y-%m-%d')
    published_ts = arrow.get(date).to('US/Pacific').to('UTC').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    item.url_hash = url_hash
    item.link = link
    item.source = 'Seattle Times'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
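
The hashlib.md5(link) dedup key used throughout assumes Python 2, where link is already a byte string; unicode URLs (or a Python 3 port) would need encoding first. A hedged Python 2 helper sketch, with url_hash_for as an illustrative name:

    # A minimal sketch: normalize the link to UTF-8 bytes before
    # hashing so unicode URLs don't trip up hashlib.
    def url_hash_for(link):
        if isinstance(link, unicode):
            link = link.encode('utf-8')
        return hashlib.md5(link).hexdigest()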
    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()

    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    if meta_published_el is not None:
        published_datetime = meta_published_el['content']
        dt = arrow.get(published_datetime).datetime
        dt = dt.replace(tzinfo=pytz.timezone('US/Pacific'))
        published_ts = arrow.get(dt).to('UTC').timestamp
        published_date = arrow.get(dt).datetime.strftime('%Y-%m-%d')

    item.link = link
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'San Diego Union Tribune'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()

    # Sleep between requests
    time.sleep(1)
    published_date = arrow.get(date).to('US/Eastern').date().strftime('%Y-%m-%d')
    published_ts = arrow.get(date).to('US/Eastern').to('UTC').timestamp

    # Make sure the date isn't greater than today (it happens)
    if published_date > arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD'):
        published_date = arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD')
        published_ts = arrow.now().to('US/Eastern').floor('day').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    item.url_hash = url_hash
    item.link = link
    item.source = 'Science Daily'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
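
One thing the future-date guard above leans on: zero-padded 'YYYY-MM-DD' strings compare in chronological order, so the string comparison is safe for ISO dates. A minimal sketch that just factors out the repeated expression:

    # A minimal sketch: compute 'today' once, then clamp both the
    # date string and the timestamp to it.
    today = arrow.now().to('US/Eastern').floor('day')
    if published_date > today.format('YYYY-MM-DD'):
        published_date = today.format('YYYY-MM-DD')
        published_ts = today.timestamp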
        meta_og_title_el = None
        meta_og_desc_el = None
        meta_og_url_el = None
        meta_published_el = None

    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()
        
    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    if meta_published_el is not None:
        published_datetime = meta_published_el['content']
        dt = arrow.get(published_datetime).to('US/Eastern')
        published_ts = dt.to('UTC').timestamp
        published_date = dt.date().strftime('%Y-%m-%d')

    item.link = link 
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'NY Post'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    try:
        meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
        meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
        meta_og_url_el = links_soup.find('meta', {'property': 'og:url'})
    except Exception, e:
        meta_og_title_el = None
        meta_og_desc_el = None
        meta_og_url_el = None

    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        description = meta_og_desc_el['content'].strip()
        
    if meta_og_url_el is not None:
        link = meta_og_url_el['content']
    
    item.link = link
    item.url_hash = url_hash
    item.title = headline
    item.summary = description
    item.source = "Accuweather"
    item.published_date = published_date
    item.published_ts = utc_dt.timestamp
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    
    # Sleep between requests
    time.sleep(1)
    # Skip the story if its collection id is in the skippable list
    if pcollid in skippable_collection_ids:
        print 'Skipping %s story' % pcollid
        continue

    # If it's also published on weather underground, skip it
    if 'wunderground' in tags:
        print 'Skipping Weather Underground Story'
        continue

    # See if the story already exists
    try:
        item = NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item Exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating new item.'
        item = NewsItem()

    item.url_hash = url_hash
    item.title = unidecode.unidecode(entry['title'].strip())
    item.summary = unidecode.unidecode(entry['description'].strip())
    item.source = "Weather Channel"
    item.link = link
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
Example #12
    prefix_match = False
    for prefix in skippable_headline_prefixes:
        if entry.title.startswith(prefix):
            prefix_match = True

    if prefix_match:
        print 'Skipping story'
        continue

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    soup = bs4.BeautifulSoup(entry.description, 'html.parser')
    item.summary = unidecode.unidecode(soup.text.strip())
    item.title = unidecode.unidecode(entry.title)

    item.url_hash = url_hash
    item.link = link
    item.authors = ''
    item.source = 'Capital WX Gang'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    # Make sure the date isn't greater than today (it happens)
    if published_date > arrow.now().to('US/Eastern').floor('day').format(
            'YYYY-MM-DD'):
        published_date = arrow.now().to('US/Eastern').floor('day').format(
            'YYYY-MM-DD')
        published_ts = arrow.now().to('US/Eastern').floor('day').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    item.url_hash = url_hash
    item.link = link
    item.source = 'Science Daily'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    url_hash = hashlib.md5(link).hexdigest()

    # See if the item already exists
    try:
        item = NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item Exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating new item.'
        item = NewsItem()

    date_string = story_dict['PublishDate']
    date_utc = arrow.get(date_string).to('UTC')
    date_eastern = date_utc.to('US/Eastern')
    published_date = date_eastern.format('YYYY-MM-DD')
    published_ts = date_utc.timestamp

    headline = story_dict['Title']
    summary = story_dict['Summary']

    item.link = link 
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'WeatherBug'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    try:
        meta_og_title_el = links_soup.find('meta', {'property': 'og:title'})
        meta_og_desc_el = links_soup.find('meta', {'property': 'og:description'})
        meta_og_url_el = links_soup.find('meta', {'property': 'og:url'})
    except Exception, e:
        meta_og_title_el = None
        meta_og_desc_el = None
        meta_og_url_el = None

    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()

    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    item.link = link 
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'Star Tribune'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    
    # Sleep between requests to be polite
    time.sleep(1)
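
The og:title / og:description / og:url lookups above are copied across several of these scrapers. A hedged consolidation (og_content is an illustrative name, not from the original code):

    # A minimal sketch: fetch one OpenGraph meta tag's content,
    # falling back to a default when the tag is absent or empty.
    def og_content(soup, prop, default=None):
        el = soup.find('meta', {'property': prop})
        if el is not None and el.get('content'):
            return el['content'].strip()
        return default

    headline = og_content(links_soup, 'og:title', headline)
    summary = og_content(links_soup, 'og:description', summary)
    link = og_content(links_soup, 'og:url', link)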
    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()

    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    if headline.endswith(' - wunderground.com'):
        headline = headline.replace(' - wunderground.com', '')

    item.link = link
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'Weather Underground'

    # The author and date are in the same text string
    parts = tr_el.em.text.strip().split('\n\t\t')

    if len(parts) == 1:
        dt = dateutil.parser.parse(parts[0])
        dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime(
            '%Y-%m-%d') + 'T00:00:00-05:00'
        timestamp = arrow.get(dt).to('UTC').timestamp
        published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
        item.published_ts = timestamp
    else:
        dt = dateutil.parser.parse(parts[1])
        dt = dt.replace(tzinfo=pytz.timezone('US/Eastern')).strftime(
            '%Y-%m-%d') + 'T00:00:00-05:00'
        timestamp = arrow.get(dt).to('UTC').timestamp
        published_date = arrow.get(dt).date().strftime('%Y-%m-%d')
        item.published_ts = timestamp

    # Make sure the date isn't greater than today (it happens)
    if published_date > arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD'):
        published_date = arrow.now().to('US/Eastern').floor('day').format('YYYY-MM-DD')
        published_ts = arrow.now().to('US/Eastern').floor('day').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    # Strip out any inline html
    summary_soup = bs4.BeautifulSoup(summary, 'html.parser')
    summary = summary_soup.text

    item.url_hash = url_hash
    item.link = link
    item.source = 'NY Times'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    
    # Skip the story if its collection id is in the skippable list
    if pcollid in skippable_collection_ids:
        print 'Skipping %s story' % pcollid
        continue

    # If it's also published on weather underground, skip it
    if 'wunderground' in tags:
        print 'Skipping Weather Underground Story'
        continue

    # See if the story already exists
    try:
        item = NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item Exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating new item.'
        item = NewsItem()

    item.url_hash = url_hash
    item.title = unidecode.unidecode(entry['title'].strip())
    item.summary = unidecode.unidecode(entry['description'].strip())
    item.source = "Weather Channel"
    item.link = link
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    url_hash = hashlib.md5(link).hexdigest()

    # See if the item already exists
    try:
        item = NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item Exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating new item.'
        item = NewsItem()

    date_string = story_dict['PublishDate']
    date_utc = arrow.get(date_string).to('UTC')
    date_eastern = date_utc.to('US/Eastern')
    published_date = date_eastern.format('YYYY-MM-DD')
    published_ts = date_utc.timestamp

    headline = story_dict['Title']
    summary = story_dict['Summary']

    item.link = link
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'WeatherBug'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
Example #20
    # The date string says GMT, but doesn't agree with the date provided. 
    # We need to figure out the current offset and apply that first.
    date_string = story_dict['Timestamp']['GMT']
    offset = datetime.datetime.now(pytz.timezone('US/Eastern')).strftime('%z')
    date_eastern = arrow.get(date_string + offset).to('US/Eastern')
    date_utc = date_eastern.to('UTC')
    published_date = date_eastern.format('YYYY-MM-DD')
    published_ts = date_utc.timestamp
    
    # If the datetime indicates midnight, default it to the current time;
    # that's at least closer to accurate, since we check every 30 minutes.
    if date_eastern.hour == 0 and date_eastern.minute == 0:
        print 'Overriding timestamp'
        date_utc = arrow.utcnow()
        published_ts = date_utc.timestamp

    headline = story_dict['Title']['Text'][0]
    summary = story_dict['IntroText']['Text'][0]

    item.link = link 
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'The Weather Network'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
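
As a hedged alternative to the offset arithmetic above, arrow can parse the naive GMT-labeled string directly in a named zone via its tzinfo argument, which also picks the DST offset matching the story's own date rather than today's:

    # A minimal sketch, assuming the same story_dict and that
    # dateutil.tz has been imported alongside dateutil.parser.
    date_string = story_dict['Timestamp']['GMT']
    date_eastern = arrow.get(date_string, tzinfo=dateutil.tz.gettz('US/Eastern'))
    published_date = date_eastern.format('YYYY-MM-DD')
    published_ts = date_eastern.to('UTC').timestamp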
    date = entry.published_parsed

    published_date = arrow.get(date).to('US/Eastern').date().strftime('%Y-%m-%d')
    published_ts = arrow.get(date).to('UTC').timestamp

    headline = entry.title
    summary = entry.summary

    if headline.startswith('The State of the Atmosphere'):
        headline = "(Denver) %s" % headline

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash==url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    item.url_hash = url_hash
    item.link = link
    item.source = 'Weather 5280'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()

    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    if meta_published_el is not None:
        published_datetime = meta_published_el['content']
        dt = arrow.get(published_datetime).datetime
        dt = dt.replace(tzinfo=pytz.timezone('US/Pacific'))
        published_ts = arrow.get(dt).to('UTC').timestamp
        published_date = arrow.get(dt).datetime.strftime('%Y-%m-%d')

    item.link = link 
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'San Diego Union Tribune'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()

    # Sleep between requests
    time.sleep(1)

    link = entry.link
    url_hash = hashlib.md5(link).hexdigest()
    date = entry.published_parsed

    published_date = arrow.get(date).to('US/Pacific').date().strftime(
        '%Y-%m-%d')
    published_ts = arrow.get(date).to('US/Pacific').to('UTC').timestamp

    # See if we already have this story
    try:
        NewsItem.get(NewsItem.url_hash == url_hash)
        print 'Item exists. Skipping.'
        continue
    except peewee.DoesNotExist:
        print 'Creating item.'
        item = NewsItem()

    headline = entry.title
    summary = entry.summary

    item.url_hash = url_hash
    item.link = link
    item.source = 'Seattle Times'
    item.title = headline
    item.summary = summary
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()
Example #24
        meta_og_title_el = None
        meta_og_desc_el = None
        meta_og_url_el = None
        meta_published_el = None

    if meta_og_title_el is not None:
        headline = meta_og_title_el['content'].strip()

    if meta_og_desc_el is not None:
        summary = meta_og_desc_el['content'].strip()

    if meta_og_url_el is not None:
        link = meta_og_url_el['content']

    if meta_published_el is not None:
        published_datetime = meta_published_el['content']
        dt = arrow.get(published_datetime).to('US/Eastern')
        published_ts = dt.to('UTC').timestamp
        published_date = dt.date().strftime('%Y-%m-%d')

    item.link = link
    item.url_hash = url_hash
    item.title = headline
    item.summary = summary
    item.source = 'NY Post'
    item.published_date = published_date
    item.published_ts = published_ts
    item.inserted_ts = arrow.utcnow().timestamp

    item.save()