# Incremental RSS crawl for one feed. Third-party imports for this snippet;
# the project-local helpers (loadLastStamp, extractPubTime, extractGuid,
# extractLink, Article, crawlContent, saveNewArticles, saveLastStamp) are
# defined elsewhere in the repo.
from urllib2 import urlopen
from datetime import datetime
import pytz
from bs4 import BeautifulSoup


def crawlFeed(source, feedName, feedUrl):
    # Given a feed name from a Source and the URL of its RSS feed, fetch the
    # new articles with their basic fields: title, URL, publish time.
    startStamp = loadLastStamp(feedName)
    html = urlopen(feedUrl).read()
    epoch = datetime(1970, 1, 1).replace(tzinfo=pytz.utc)
    soup = BeautifulSoup(html, "html.parser")
    latestStamp = startStamp
    newArticles = []
    for it in soup.find_all("item"):
        dt = extractPubTime(it)
        guid = extractGuid(it, source)
        # Convert the aware datetime to a Unix timestamp; Python 2 has no
        # datetime.timestamp(), hence the manual epoch subtraction.
        timestamp = (dt - epoch).total_seconds()
        if timestamp > startStamp:  # new article
            latestStamp = max(timestamp, latestStamp)
            url = extractLink(it)
            newArticles.append(Article(guid, it.title.text, url, timestamp, source, feedName, "", ""))
        else:
            break  # done; this assumes items are ordered by descending pubDate
    newArticles = crawlContent(newArticles)  # crawls for content, image and possible keywords
    saveNewArticles(newArticles)  # save to database
    print feedName, " => +" + str(len(newArticles))
    saveLastStamp(feedName, latestStamp)  # persist so already-seen articles are not reloaded
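# Neither the stamp helpers nor extractPubTime appear in this excerpt. Below is
# a minimal sketch of what they might look like: the `lastStamps` collection
# name and the <pubDate> tag access are assumptions, not the repo's actual schema.
from dbco import *  # provides the shared `db` handle, as in the other scripts
from email.utils import parsedate_tz, mktime_tz


def loadLastStamp(feedName):
    # Newest pubDate timestamp seen for this feed; 0 on the first crawl.
    doc = db.lastStamps.find_one({'feed': feedName})  # assumed collection name
    return doc['stamp'] if doc else 0


def saveLastStamp(feedName, stamp):
    # Upsert so the next crawlFeed(...) call skips already-seen items.
    db.lastStamps.update({'feed': feedName}, {'$set': {'stamp': stamp}}, upsert=True)


def extractPubTime(item):
    # Parse the RFC 822 <pubDate> of an RSS <item> into an aware UTC datetime
    # (html.parser lowercases tag names, hence item.pubdate).
    ts = mktime_tz(parsedate_tz(item.pubdate.text))
    return datetime.utcfromtimestamp(ts).replace(tzinfo=pytz.utc)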
# Debug script: re-crawl one article and write its fresh content next to the
# stored content so the two can be compared.
from dbco import *
from article import *
from crawlContent import *
import sys

# One-off backfill, kept here for reference:
# articles = db.qdoc.find({'content': ''})
# for a in articles:
#     art = [Article('blabla', a['title'], a['url'], 1431243710, a['source'], "cnn_world")]
#     art = crawlContent(art)
#     new_cont = art[0].content
#     db.qdoc.update({'guid': a['guid']}, {'$set': {'content': new_cont}}, multi=True)
# print "Done"

source = sys.argv[1]
ski = int(sys.argv[2])
# Note: the second find() immediately overwrites the first, so the `source`
# argument is effectively unused while the keyword query below is active.
a = db.qdoc.find({'$query': {'source': source}, '$orderby': {'timestamp': -1}}).limit(1).skip(ski)
a = db.qdoc.find({'$query': {'keywords': 'print'}, '$orderby': {'timestamp': -1}}).limit(1).skip(ski)
a = a[0]
art = [Article(a['_id'], a['title'], a['url'], a['timestamp'], a['source'], a['feed'])]
art = crawlContent(art)
with open("new_content.txt", "w") as f:
    f.write(art[0].content)
with open("old_content.txt", "w") as f:
    f.write(a['content'].encode('utf-8'))
print "Done"
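# Usage sketch (the script file name is hypothetical; argv is <source> <skip>):
#   python compare_content.py cnn 0
#   diff old_content.txt new_content.txt    # inspect what the re-crawl changed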
# Backfill script: re-crawl every article whose stored content is empty and
# write the freshly extracted content back to the database.
from dbco import *
from crawlContent import *
from article import *

print db.qdoc.find({'content': ''}).count()  # how many articles still lack content
art = db.qdoc.find({'content': ''})
for ar in art:
    # Only the URL matters to crawlContent; the other Article fields are dummies.
    a = [Article('blabla', "blabla2", ar['url'], 1431243710, 'cnn', "cnnyolo")]
    a = crawlContent(a)
    db.qdoc.update({'url': ar['url']}, {'$set': {'content': a[0].content}})
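# Once the loop finishes, re-running the count query above should report (close
# to) zero; any documents still empty are pages crawlContent failed to extract.
print db.qdoc.find({'content': ''}).count()  # verify the backfill worked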