def main():
    """Entry point for the baidunews crawler.

    Loads the shared system config and the module config, builds the MQ
    request queue, computes the crawl time window, then crawls every channel.
    Expects exactly two CLI arguments (argv[2] is the config directory).
    """
    global baidunewsConf, baidunewsCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()
    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    # 1. Load the shared system configuration.
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    # 2. Load the baidunews config and resolve the channel list from it.
    baidunewsConf = loadConfig(configFile)
    Channels = getChannels(baidunewsConf)
    baidunewsCrawlerQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(),
        ampqer.getRoutingKey(), ampqer.getHostQueue())

    # Crawl window: from `timeperiod` days ago up to now.
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
    TenDaysAgoTime = window_start.strftime("%Y-%m-%d %H:%M:%S")

    # 3. Start crawling.
    print("Start baidunewsCrawler ...")
    for channel in Channels:
        getNewsList(channel)
def main():
    """Entry point for the netease crawler.

    Loads system and module configuration, builds the MQ request queue,
    computes the crawl time window, then crawls each configured channel id.
    Expects exactly two CLI arguments (argv[2] is the config directory).
    """
    global neteaseConf, ChannelIds, neteaseCrawlerQueue, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()
    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    # 1. Load the shared system configuration.
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    # 2. Load the netease config; ChannelIds maps channel name -> id.
    neteaseConf = loadConfig(configFile)
    ChannelIds = neteaseConf["ChannelIds"]
    neteaseCrawlerQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(),
        ampqer.getRoutingKey(), ampqer.getHostQueue())

    # Crawl window: from `timeperiod` days ago up to now.
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
    TenDaysAgoTime = window_start.strftime("%Y-%m-%d %H:%M:%S")

    # 3. Start crawling.
    for channelid in ChannelIds:
        getChannelNewsList(channelid)
def main(): global config, weiboCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime, weibolinkQueue if len(sys.argv) != 3: usage() config_dir = sys.argv[2] configFile = os.path.join(config_dir, MODULENAME + ".conf") #1.load system config appConf = appSystemVars.appSystemConf appConf.loadConfigBuffer(MAINCONFIGBUFFER) crawlerDB = appConf.getCrawlerDB() resultManager = appConf.getResultManager() DBPC = appConf.getDBPC() logConfigger = appConf.getLogger() ampqer = appConf.getMQ() timeperiod = appConf.getTimePeriod() tracksource = appConf.getTrackSource() tracksourceHost = tracksource.getHost() tracksourcePort = tracksource.getPort() #2. load weibo config Channels = getweiboChannels(tracksourceHost, tracksourcePort) config = ConfigParser.ConfigParser() config.read(configFile) weiboCrawlerQueue = appCrawlerQueue(\ ampqer.getURL(), ampqer.getExchange(), ampqer.getRoutingKey(), ampqer.getHostQueue()) weibolinkQueue = appCrawlerQueue(\ ampqer.getURL(), ampqer.getExchange(), ampqer.getRoutingKey(), ampqer.getLinkQueue()) CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) TenDaysAgo = (datetime.datetime.now() - datetime.timedelta(int(timeperiod))) TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S") #start crawler appLogger.info("Start weiboCrawler ...") start = time.time() for channel in Channels: if channel[ 'name'] in '新华网 央视新闻 北京日报 新浪视频 中安在线 人民日报 重庆晚报 泰州晚报 北京晨报 南通网': print channel['name'] getNewsList(channel) end = time.time() #crawl timeline print end - start
def main():
    """Entry point for the tencent crawler.

    Loads system and module configuration, builds the MQ request and
    outer-link queues, computes the crawl time window, then crawls each
    configured channel id.
    Expects exactly two CLI arguments (argv[2] is the config directory).
    """
    global tencentConf, tencentCrawlerQueue, ChannelIds, CurrentTime, TenDaysAgoTime, tencentlinkQueue

    if len(sys.argv) != 3:
        usage()
    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    # 1. Load the shared system configuration.
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    # 2. Load the tencent config; queues are built from the shared MQ settings.
    tencentConf = loadConfig(configFile)
    ChannelIds = tencentConf["ChannelIds"]
    tencentCrawlerQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(),
        ampqer.getRoutingKey(), ampqer.getHostQueue())
    tencentlinkQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(),
        ampqer.getRoutingKey(), ampqer.getLinkQueue())

    # Crawl window: from `timeperiod` days ago up to now.
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
    TenDaysAgoTime = window_start.strftime("%Y-%m-%d %H:%M:%S")

    # 3. Start crawling.
    for channelid in ChannelIds:
        getChannelNewsList(channelid)
def main(): global wechatConf, wechatCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime if len(sys.argv) != 3: usage() config_dir = sys.argv[2] configFile = os.path.join(config_dir, MODULENAME + ".conf") #1.load system config appConf = appSystemVars.appSystemConf appConf.loadConfigBuffer(MAINCONFIGBUFFER) crawlerDB = appConf.getCrawlerDB() resultManager = appConf.getResultManager() DBPC = appConf.getDBPC() logConfigger = appConf.getLogger() ampqer = appConf.getMQ() timeperiod = appConf.getTimePeriod() tracksource = appConf.getTrackSource() tracksourceHost = tracksource.getHost() tracksourcePort = tracksource.getPort() #2. load wechat config try: Channels = getWechatChannels(tracksourceHost, tracksourcePort) except: appLogger.error(traceback.format_exc()) wechatConf = loadConfig(configFile) Channels = wechatConf['channels'] wechatCrawlerQueue = appCrawlerQueue(ampqer.getURL(), ampqer.getExchange(), ampqer.getRoutingKey(), ampqer.getHostQueue()) CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) TenDaysAgo = (datetime.datetime.now() - datetime.timedelta(int(timeperiod))) TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S") #start crawler appLogger.info("Start wechatCrawler ...") print len(Channels) start = time.time() channel_list = '太行日报 新华视点 淮北日报 新华视界 新华国际 微黔江 天津日报 人民日报 上海观察 深圳特区报' for channel in Channels: if channel['name'] in channel_list: #print channel['name'] getNewsList(channel) time.sleep(300) end = time.time() #crawl timeline print end - start
def main():
    """Entry point for the sohu crawler.

    Loads system and module configuration, builds the MQ request queue,
    computes the crawl time window, then crawls each configured channel —
    dispatching to the video / album fetchers for the special channel ids
    (36 for videos, 47/54 for albums) before fetching regular news.
    Expects exactly two CLI arguments (argv[2] is the config directory).
    """
    global sohuConf, sohuCrawlerQueue, ChannelIds, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()
    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    # 1. Load the shared system configuration.
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()

    # 2. Load the sohu config; the queue is built from the shared MQ settings.
    sohuConf = loadConfig(configFile)
    ChannelIds = sohuConf["ChannelIds"]
    sohuCrawlerQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(),
        ampqer.getRoutingKey(), ampqer.getHostQueue())

    # Crawl window: from `timeperiod` days ago up to now.
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
    TenDaysAgoTime = window_start.strftime("%Y-%m-%d %H:%M:%S")

    # 3. Start crawling.
    print("Start sohuCrawler ...")
    for channelid in ChannelIds:
        numeric_id = int(ChannelIds[channelid])
        # Videos live on channel id 36.
        if numeric_id == 36:
            getVideoList()
        # Photo albums live on channel ids 47 and 54.
        if numeric_id in (47, 54):
            getAlbumList(channelid)
        # Every channel also gets a regular news crawl.
        getNewsList(channelid)
def main():
    """Entry point for the xinhua crawler.

    Loads the system configuration, builds the MQ request queue, computes
    the crawl time window, then crawls the news lists for today and the
    two preceding days.
    Expects exactly two CLI arguments (argv[2] is the config directory).
    """
    global xinhuaConf, xinhuaCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime

    if len(sys.argv) != 3:
        usage()
    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")

    # 1. Load the shared system configuration.
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()
    tracksource = appConf.getTrackSource()
    tracksourceHost = tracksource.getHost()
    tracksourcePort = tracksource.getPort()

    # 2. Build the MQ request queue from the shared settings.
    xinhuaCrawlerQueue = appCrawlerQueue(
        ampqer.getURL(), ampqer.getExchange(),
        ampqer.getRoutingKey(), ampqer.getHostQueue())

    # Crawl window: from `timeperiod` days ago up to now.
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    window_start = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
    TenDaysAgoTime = window_start.strftime("%Y-%m-%d %H:%M:%S")

    # 3. Crawl today plus the previous two days (date string, YYYY-MM-DD).
    appLogger.info("Start xinhuaCrawler ...")
    for day_offset in range(3):
        videodate = DateFormat(int(time.time()) - 3600 * 24 * day_offset)[:10]
        getNewsList(videodate)