class TweetFetcher():

    def __init__(self, consumer_key=config.consumer_key,
                 consumer_secret=config.consumer_secret,
                 access_token=config.access_token,
                 access_token_secret=config.access_token_secret):
        self.auth = OAuth1(consumer_key, consumer_secret,
                           access_token, access_token_secret)
        self.session = requests.Session()
        self.db_interface = MongoDao()

    def fetch_by_users(self, twitter_handles):
        # Track mentions of each handle on the streaming endpoint
        params = {'track': ",".join(['@' + h for h in twitter_handles])}
        stream = self.session.post(url=config.api_url, auth=self.auth,
                                   data=params, stream=True)
        for line in stream.iter_lines():
            if line:
                print('record found')
                post = json.loads(line)
                # Attach sentiment scores to the tweet before persisting it
                post.update(self._compute_sentiment(post['text']))
                self.db_interface.insert(post)
                print('inserted record from {}'.format(post['user']['screen_name']))

    def _compute_sentiment(self, text):
        return vader(text.encode('utf8'))
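A minimal usage sketch for the class above, assuming `OAuth1` comes from `requests_oauthlib`, that the `config` module holds the credentials and `api_url` the class references, and that `vader` is a local helper returning a dict of sentiment scores. The `dao` and `sentiment` module names are assumptions, not taken from the original code.

import json
import requests
from requests_oauthlib import OAuth1   # assumed source of OAuth1

import config                  # assumed: module holding the keys and api_url
from dao import MongoDao       # assumed: thin insert() wrapper around MongoDB
from sentiment import vader    # assumed: returns e.g. {'compound': 0.6, ...}

if __name__ == '__main__':
    fetcher = TweetFetcher()
    # Blocks on the streaming endpoint and stores one scored document per tweet
    fetcher.fetch_by_users(['nytimes', 'BBCBreaking'])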
async def start():
    """
    Start the crawler and update every novel.
    :return:
    """
    logging.info("Starting crawler, updating novels ...")
    # Load all novels from the database
    novels = []
    with MongoDao() as dao:
        # Query the full list of novels
        novels += novel_iterator(dao)
    if not novels:
        return
    async with aiohttp.ClientSession() as session:
        # Walk through the novels and update each one in turn
        for novel in novels:
            logging.debug("Crawling updates for 《%s》", novel["name"])
            # Check whether the novel has new chapters, e.g.
            # {'number': '20380548', 'original_url': 'https://www.biquge5200.cc/52_52542/20380548.html'}
            novel_id = str(novel["_id"])
            # Fetch the latest stored chapter; its source URL marks where updates start
            with MongoDao() as dao:
                chapter = dao.get_latest_chapter(novel_id)
            # If there are new chapters, crawl them and save them to the database
            new_urls = await chapter_urls(session, novel["origin_url"],
                                          chapter and chapter["origin_url"])
            for new_url in new_urls:
                await update_chapter(session, new_url[0], new_url[1], novel_id)
    logging.info("Stopping crawler, update finished ...")
async def update_chapter(aiohttp_session, number, chapter_origin_url, novel_id):
    """
    Update a single chapter.
    :param aiohttp_session:
    :param number:
    :param chapter_origin_url:
    :param novel_id:
    :return:
    """
    try:
        html = await download(aiohttp_session, chapter_origin_url)
        if html:
            logging.info("Updating chapter: {}".format(chapter_origin_url))
            # ($title, $content, $url)
            results = await parse_chapter(html, chapter_origin_url)
            if results:
                with MongoDao() as dao:
                    dao.save_chapter(novel_id, {"number": number,
                                                "title": results[0],
                                                "content": results[1],
                                                "origin_url": results[2]})
    except BaseException:
        logging.error("Failed to update chapter [{}]!".format(chapter_origin_url))
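For completeness, a sketch of how the `start()` coroutine might be driven; the entry point and the periodic re-check loop are assumptions rather than part of the original code.

import asyncio
import logging

async def main():
    # Hypothetical driver: run one full update pass, then sleep and repeat
    while True:
        await start()
        await asyncio.sleep(30 * 60)   # assumed interval: re-check every 30 minutes

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())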
def query_novel(novel_id):
    with MongoDao() as dao:
        # Look up the novel and its chapter list
        data = dao.get_novel(novel_id)
    return render_template('catalog.html',
                           novel=utils.convert_id_string(data["novel"]),
                           chapters=map(utils.convert_id_string, data["chapters"]))
def query_novels():
    """
    Novel list page.
    :return:
    """
    with MongoDao() as dao:
        novels = dao.list_novel()
    return render_template('index.html', novels=novels)
def query_chapter(novel_id, chapter_id):
    with MongoDao() as dao:
        result = dao.get_chapter(novel_id, chapter_id)
    if not result:
        abort(404)
        return
    chapter = result["curr_chapter"]
    # The content is rendered as HTML, so convert newlines to <br> tags
    chapter["content"] = chapter["content"].replace("\n", "<br>")
    return render_template(
        'chapter.html',
        chapter=chapter,
        novel_id=novel_id,
        prev_id=result["prev_chapter"] and str(result["prev_chapter"]["_id"]) or None,
        next_id=result["next_chapter"] and str(result["next_chapter"]["_id"]) or None)
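The three view functions above still need URL rules bound to a Flask application. A sketch follows; the app object and the route patterns are assumptions chosen to match the function parameters, not taken from the original project.

from flask import Flask, render_template, abort

app = Flask(__name__)

# Hypothetical URL layout: index page, novel catalog, single chapter
app.add_url_rule('/', 'query_novels', query_novels)
app.add_url_rule('/novel/<novel_id>', 'query_novel', query_novel)
app.add_url_rule('/novel/<novel_id>/chapter/<chapter_id>', 'query_chapter', query_chapter)

if __name__ == '__main__':
    app.run(debug=True)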
import logging, sys

import newspaper
import simplejson as json
from util import LoggerConfig
from newspaper import Config, Source
from dao import MongoDao
from concurrent.futures import ThreadPoolExecutor, wait, as_completed

logging.basicConfig(**LoggerConfig.logger_config)
logger = logging.getLogger(__name__)
newsDao = MongoDao.NewsArticleDao()


class NewsCrawlerConfig(object):
    SITE_FILE = "config/news_sites.json"
    CRAWL_FILE = "config/crawl_options.json"

    class NewsSite(object):
        def __init__(self, name, url, crawl_threads):
            self.name = name
            self.url = url
            self.crawl_threads = crawl_threads

    def __init__(self):
        self.sites = []
        self.crawl_option = Config()
        self.is_config_read = False

    def as_newscrawler(self, site_obj):
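The nested `NewsSite` class is a record for one entry of `config/news_sites.json` (the `as_newscrawler` method is truncated in the source). As a purely hypothetical illustration of the file shape implied by the constructor parameters, and of how such entries could be materialised, here is a standalone loader sketch; it is not the original `as_newscrawler` implementation.

# Hypothetical shape of config/news_sites.json, inferred from NewsSite(name, url, crawl_threads):
#   [
#     {"name": "BBC News", "url": "https://www.bbc.com", "crawl_threads": 4},
#     {"name": "Reuters",  "url": "https://www.reuters.com", "crawl_threads": 2}
#   ]
# Standalone loader sketch; not the original as_newscrawler implementation.
import simplejson as json

def load_news_sites(path=NewsCrawlerConfig.SITE_FILE):
    with open(path) as f:
        entries = json.load(f)
    return [NewsCrawlerConfig.NewsSite(e["name"], e["url"], e["crawl_threads"])
            for e in entries]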