def __init__(self, hit_list):
    """Download every feed in *hit_list* and index its entries.

    Each URL is handed to ``feedparser.parse`` through a ``Future``; the
    ``"items"`` of all feeds are merged, ordered by their ``published``
    value in descending order, and passed one by one to
    ``indexer.index_my_news`` with the ``"..//indexdir"`` index path.

    Args:
        hit_list: iterable of feed URLs.
    """
    # Wrap each download in a Future (defined elsewhere in the project;
    # presumably it runs the call in the background -- confirm).
    pending = [Future(feedparser.parse, rss_url) for rss_url in hit_list]
    # Calling a Future object yields the parse result; per the original
    # comment this blocks until every feed is in.
    feeds = [fetch() for fetch in pending]

    entries = []
    for feed in feeds:
        entries.extend(feed["items"])

    # Most recent entries first.  The original built this sorted list and
    # then looped over the unsorted ``entries``, silently discarding the
    # ordering; the loop below now uses it.
    # NOTE(review): ``published`` is compared as-is; if it is the raw date
    # string the order is lexicographic, not chronological -- confirm
    # whether ``published_parsed`` should be the key instead.
    newest_first = sorted(
        entries, key=lambda entry: entry['published'], reverse=True
    )

    for entry in newest_first:
        # Keep only the text before the first "<", i.e. drop any markup
        # that follows the leading plain-text part of the summary.
        teaser = re.split("<", entry['summary'])[0]
        # NOTE(review): the meaning of the trailing ``True`` flag is
        # defined by indexer.index_my_news, which is not visible here.
        indexer.index_my_news(
            entry['title'], entry['link'], teaser, "..//indexdir", True
        )
def parseFeeds(self):
    """Fetch the politics feeds, index their entries and store new ones.

    For every entry of every feed:

    1. the title, link and the part of the summary before the first
       ``"<"`` are extracted;
    2. ``Resources.encontraNomes`` is run on title + content;
    3. the entry is sent to ``indexer.index_my_news`` (politics index);
    4. when the indexer returns a truthy id, a ``News`` row is created and
       ``self.insereBD`` is launched through a ``Future`` with the
       entities, the news object, the adjectives and the classifier.

    An entry whose entity extraction raises ``UnicodeEncodeError`` is
    reported and skipped.  A ``TypeError`` raised while indexing/storing
    an entry is reported by printing the entry title, and processing
    continues with the next entry.
    """
    adj = self.getAdj()
    res = Resources()
    classifier = self.getClassifier(adj)

    # Feeds to pull down.
    hit_list = [
        "http://feeds.jn.pt/JN-Politica",
        "http://feeds.dn.pt/DN-Politica",
        "http://economico.sapo.pt/rss/politica",
    ]

    # These calls are made directly (no Future wrapper), one feed after
    # the other.
    feeds = [feedparser.parse(rss_url) for rss_url in hit_list]
    print(feeds)

    entries = []
    for feed in feeds:
        entries.extend(feed["items"])

    for e in entries:
        n_title = e['title']
        n_link = e['link']
        # Keep only the text that precedes the first "<" of the summary.
        n_content = re.split("<", e['summary'])[0]

        try:
            # NOTE(review): title and content are joined with an empty
            # string, so they run together without a separator -- confirm
            # whether a space was intended.
            entidades = res.encontraNomes(n_title + "" + n_content, '')
        except UnicodeEncodeError:
            print("Erro a encontrar entidades")
            # Without this, ``entidades`` would be unbound on the first
            # entry (NameError) or still hold the previous entry's
            # entities, which would then be attached to the wrong news.
            continue

        try:
            thisid = indexer.index_my_news(
                n_title, n_link, n_content, "libra/politics/indexdir", False
            )
            # Same text the Python 2 statement
            # ``print "searcher ", count, thisid`` produced.
            print("searcher  %s %s" % (Searcher().getcount(), thisid))
            if thisid:
                news = News.objects.create_Simple_News(
                    thisid, n_title, n_link, n_content
                )
                # Hand the database insertion to a Future; its result is
                # never collected here.
                Future(self.insereBD, entidades, news, adj, classifier)
        except TypeError:
            print(n_title)