def __init__(self):
    # Load the user-agent pool from the file configured in the project settings.
    user_agent_file = get_project_settings()['USER_AGENT_FILE']
    json_obj = JsonLoad(user_agent_file)
    self.agents = json_obj.getlist()
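# A minimal usage sketch of the pool above: pick a fresh User-Agent per request.
# The two agent strings are stand-ins for whatever USER_AGENT_FILE contains.
import random

agents = ['Mozilla/5.0 (Windows NT 6.1)', 'Mozilla/5.0 (X11; Linux x86_64)']
headers = {'User-Agent': random.choice(agents)}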
def __init__(self, settings):
    self.settings = settings
    # Logging object for run records.
    self.log_obj = write_record(self.settings['record_log'])
    # Load the user-agent pool.
    json_object = JsonLoad(self.settings['user_agent_file'])
    self.agent_list = json_object.getlist()
    self.timeout = self.settings['timeout']
def get_sina_comment_2(self, article_item):
    refer = article_item['article_url']
    article_item['article_discuss'] = article_item['article_discuss'].split('?')
    article_item['article_discuss_number'] = 0
    article_item['article_attend_number'] = 0
    if len(article_item['article_discuss']) != 2:
        article_item['article_discuss'] = []
        return
    comment_page = 1
    lefturl = article_item['article_discuss'][1]
    cmntlist = []  # comments fetched for the current page
    article_item['article_discuss'] = []
    json_object = JsonLoad(self.settings['USER_AGENT_FILE'])
    agent_list = json_object.getlist()
    # Page through the comment API until an empty page comes back.
    while comment_page == 1 or cmntlist != []:
        one_user_agent = random.choice(agent_list)
        headers = {'User-Agent': one_user_agent, 'Referer': refer}
        comment_url = self.website_config['comment_url'] + '&' + lefturl + '&page=' + str(comment_page)
        try:
            request = urllib2.Request(comment_url, headers=headers)
            comment_content = urllib2.urlopen(request, timeout=self.settings['DOWNLOAD_TIMEOUT']).read()
            if comment_content is None:
                break
            # The response is JSONP-like ("var data={...}"); keep everything from the "{" on.
            find_str = "={"
            extract_contain = comment_content[comment_content.index(find_str) + len(find_str) - 1:]
            # Translate the JSON null literal so the payload can be eval'ed as a Python dict.
            extract_contain = extract_contain.replace('null', 'None')
            real_content = eval(extract_contain)
            if 'cmntlist' in real_content['result']:
                cmntlist = real_content['result']['cmntlist']
            else:
                cmntlist = []
            if cmntlist != []:
                article_item['article_discuss'].append(cmntlist)
            # The totals only need to be read once, from the first page.
            if comment_page == 1 and ('count' in real_content['result']):
                article_item['article_discuss_number'] = int(real_content['result']['count']['show'])
                article_item['article_attend_number'] = int(real_content['result']['count']['total'])
            comment_page = comment_page + 1
        except BaseException as error:
            date = datetime.datetime.now()
            sendbody = ("time:" + date.strftime("%Y-%m-%d %H:%M:%S") +
                        " comment_url:" + comment_url + " " + str(error) + "\n")
            with open(self.settings['WRONG_FILE'], 'a') as f:
                f.write(sendbody)
            break
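# A minimal sketch of a safer alternative to the eval()-based parsing above:
# strip the JSONP wrapper and hand the payload to json.loads, which handles
# null/true/false natively. parse_jsonp and the sample payload below are
# hypothetical; the real field names come from the Sina comment API response
# handled in get_sina_comment_2.
import json

def parse_jsonp(raw):
    # Keep everything from the first "{" after "=" through the last "}".
    start = raw.index('={') + 1
    end = raw.rindex('}') + 1
    return json.loads(raw[start:end])

# Example (hypothetical payload):
# data = parse_jsonp('var data={"result": {"cmntlist": [], "count": {"show": "0", "total": "0"}}};')
# cmntlist = data['result']['cmntlist']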
def get_fenghuangwang_comment(self, article_item):
    comment_url = self.website_config['discuss_url'] + article_item['article_url']
    json_object = JsonLoad(self.settings['USER_AGENT_FILE'])
    agent_list = json_object.getlist()
    one_user_agent = random.choice(agent_list)
    headers = {'User-Agent': one_user_agent, 'Referer': article_item['article_url']}
    article_item['article_discuss'] = []
    article_item['article_discuss_number'] = 0
    article_item['article_attend_number'] = 0
    try:
        request = urllib2.Request(comment_url, headers=headers)
        comment_content = urllib2.urlopen(request, timeout=self.settings['DOWNLOAD_TIMEOUT']).read()
        if not comment_content:
            raise Exception("failed to fetch comments")
        # JSONP-like response: keep everything from the "{" on.
        find_str = "={"
        extract_contain = comment_content[comment_content.index(find_str) + len(find_str) - 1:]
        # Translate the JSON literals so the payload can be eval'ed as a Python dict.
        extract_contain = extract_contain.replace('null', 'None')
        extract_contain = extract_contain.replace('false', 'None')
        # Drop the trailing ";" and turn the payload into a dict.
        real_content = eval(extract_contain[0:len(extract_contain) - 1])
        article_item['article_discuss'] = real_content['comments']
        article_item['article_discuss_number'] = int(real_content['count'])
        article_item['article_attend_number'] = int(real_content['join_count'])
    except Exception as error:
        date = datetime.datetime.now()
        sendbody = ("time:" + date.strftime("%Y-%m-%d %H:%M:%S") +
                    " comment_url:" + comment_url + " " + str(error) + "\n")
        with open(self.settings['WRONG_FILE'], 'a') as f:
            f.write(sendbody)
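# A hypothetical usage sketch for get_fenghuangwang_comment: `spider` stands in
# for an instance of the (unnamed here) spider class that owns website_config
# and settings, and the URL is a placeholder; the dict keys mirror the ones the
# method reads and fills in.
# item = {'article_url': 'http://news.ifeng.com/a/placeholder.shtml'}
# spider.get_fenghuangwang_comment(item)
# print item['article_discuss_number'], item['article_attend_number']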
class Re_Spider(object):
    def __init__(self, settings_file):
        self.filename = settings_file
        self.json_object = JsonLoad(settings_file)
        self.settings = self.json_object.getdata()
        # Collaborators: database access, crawler wrapper, feature vectors, date helpers.
        self.db_operation = news_operation(self.settings['record_log'])
        self.spider_operation = Spider_Operation(self.settings)
        self.vector_generator = BasePreProcessItem()
        self.time_operation = TimeOperate()

    def day_by_day_update(self):
        try:
            if self.settings['start_time'] == "" or self.settings['stop_time'] == "":
                raise Exception("the start and stop times must be configured")
            update_time = self.time_operation.str2date(self.settings['start_time'])
            # Walk backwards one day at a time until stop_time is passed.
            while str(update_time) >= self.settings['stop_time']:
                self.settings['start_time'] = str(update_time)
                print("progress: %s" % (update_time))
                self.settings['extract_condition']['article_publish_time'] = str(update_time)
                self.__spider_for_page()
                self.db_operation.reset_para()
                update_time = self.time_operation.getthepreviousday(update_time)
            return True
        except BaseException as error:
            self.__error_email_info(error)
            return False
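# A minimal sketch of the date helpers Re_Spider leans on. TimeOperate is not
# shown in this section, so the implementations below are assumptions: str2date
# parses "YYYY-MM-DD" and getthepreviousday steps back one calendar day.
import datetime

class TimeOperateSketch(object):
    def str2date(self, s):
        # Assumed format, consistent with the string comparisons above.
        return datetime.datetime.strptime(s, '%Y-%m-%d').date()

    def getthepreviousday(self, d):
        return d - datetime.timedelta(days=1)

# Example: TimeOperateSketch().getthepreviousday(datetime.date(2016, 1, 1))
# -> datetime.date(2015, 12, 31)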
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Py2 hack: force UTF-8 as the default encoding

if __name__ == '__main__':
    try:
        fenghuang_update_object = FengHuangSettings()
        settings = get_project_settings()
        fenghuang_update_object.updatesettings(settings)
        email_object = Email(settings)
        configure_logging(settings)
        read_json_file = JsonLoad(settings['SPLIT_JSON_FILE'])
        json_data = read_json_file.getdata()
        runner = CrawlerRunner(settings)
        # Record the start time so the crawl duration can be reported later.
        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        for json_key in json_data:
            website_config = json_data[json_key]  # per-site configuration
            website_url = website_config['url']
def run_sina():
    # settings, runner and email_object are module-level globals here.
    try:
        read_json_file = JsonLoad(settings['SINA_JSON_FILE'])
        configure_logging(settings)
        json_data = read_json_file.getdata()
        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # start time
        logging.info('Sina crawl started at: ' + begin_time)
        time_operation = TimeOperate()
        if json_data['start_time'] == '':
            yesterday_date = time_operation.getyesterdaydate()  # default to yesterday
        else:
            yesterday_date = time_operation.str2date(json_data['start_time'])
        temp_begin_spider_date = yesterday_date
        if str(temp_begin_spider_date) == json_data['stop_time']:
            logging.info('Sina crawl finished at: ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            os._exit(0)
        while True:
            # Stop once the date range or the old-version cutoff is reached.
            if str(yesterday_date) <= json_data['stop_time'] or str(yesterday_date) < settings['SINA_OLD_START_DATE']:
                break
            # Pick the site config matching the page layout used on this date.
            if str(yesterday_date) >= settings['SINA_NEW_START_DATE']:
                web_config = json_data['new_version']
            else:
                web_config = json_data['old_version']
            deal_class = web_config['deal_class']
            logging.info('crawling date: ' + str(yesterday_date))
            print str(yesterday_date)
            yield runner.crawl(globals()[deal_class], website_config=web_config,
                               spider_date=yesterday_date, settings=settings)
            yesterday_date = time_operation.getthepreviousday(yesterday_date)  # step back one day
        reactor.stop()
        json_data['stop_time'] = str(temp_begin_spider_date)  # update the stop time
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # end time
        logging.info(' begin at :' + begin_time + ' end at :' + end_time)
        sendbody = "time:" + end_time + " Sina crawl finished" + "\n"
        email_object.send_information(sendbody, "Sina crawl finished", True)
        os._exit(0)
    except BaseException as error:
        time_object = TimeOperate()
        date = time_object.getnow()
        logging.exception(error)
        sendbody = "time:" + date.strftime("%Y-%m-%d %H:%M:%S") + "error:" + str(error) + "\n"
        email_object.send_information(sendbody)
        raise CloseSpider('Sina crawl failed')
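# run_sina is a generator that yields CrawlerRunner deferreds. A minimal sketch
# of the usual Scrapy/Twisted wiring that drives such a generator, assuming a
# module-level `settings`: wrap it with defer.inlineCallbacks and start the
# reactor. This is the standard "run spiders sequentially" pattern, not code
# confirmed to be in this project.
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner

# runner = CrawlerRunner(settings)
# crawl_job = defer.inlineCallbacks(run_sina)
# crawl_job()    # schedules the crawls; run_sina calls reactor.stop() itself
# reactor.run()  # blocks until reactor.stop()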