ap_scraper.py
#!/usr/bin/env python2.7
__author__ = "Noah Yoshida"

import urllib

from bs4 import BeautifulSoup as soup

from article import article

def ap_scraper(my_url):
    '''
    Returns a list of article objects scraped from the given AP news RSS URL.
    You can choose different Associated Press RSS URLs to serve up.
    '''
    # my_url = "http://feeds.bbci.co.uk/news/rss.xml"  # example: any RSS feed URL works
    # fetch the raw feed and parse it as XML
    uClient = urllib.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'xml')
    titles = []
    descriptions = []
    dates = []
    links = []
    # collect each <item>'s fields into parallel lists
    # (this could be done in a single pass for better space and time efficiency)
    for item in page_soup.findAll('item'):
        titles.append(item.title.text.encode("utf-8"))
        descriptions.append(item.description.text.encode("utf-8"))
        dates.append(str(item.pubDate.text))
        links.append(item.link.text)
    # bundle the parallel lists into article objects
    article_list = []
    for title, description, date, link in zip(titles, descriptions, dates, links):
        article_list.append(article(title, description, date, link))
    return article_list
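
# Minimal usage sketch, not part of the original module: fetch one feed and
# print each parsed article. The URL below is the example feed from the
# comment inside ap_scraper(); swap in whichever Associated Press RSS
# endpoint you need. This assumes the imported article class has a
# printable string form, which the original debug prints suggest.
if __name__ == "__main__":
    example_url = "http://feeds.bbci.co.uk/news/rss.xml"  # assumed example feed
    for story in ap_scraper(example_url):
        print story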