# Excerpt of the URL-collection phase. makesure_page, execute_link,
# error_record, e_email and input_thread are defined elsewhere in this script.
import re
import time
import codecs
from datetime import timedelta


def catch_url(driver, url, temp_date1, temp_date2, date_end, start_point=1):
    while temp_date2 <= date_end:
        data_confirm = "n"
        day_interval = 10
        # Shrink the date window until the site reports at most 200 result pages.
        while data_confirm != "y":
            driver = makesure_page(driver, url, temp_date1, temp_date2)
            flag = 0
            while flag == 0:
                try:
                    data_summary = driver.find_element_by_css_selector("td.pager").text
                    flag = 1
                except:
                    print "error in finding the pager element, reloading"
                    driver = makesure_page(driver, url, temp_date1, temp_date2)
                    time.sleep(5)
                if flag == 1:
                    print "got the pager info"
                    break
            print "#" + data_summary + "#"
            page_info = re.findall(r"[-+]?\d+\.?\d*", data_summary)
            num_page = page_info[0]
            total_record = page_info[-1]
            # Check the number of result pages.
            if int(num_page) <= 200:
                data_confirm = "y"
            else:
                # Too many results: shrink the window by half the interval and retry.
                temp_date2 = temp_date2 - timedelta(days=int(day_interval / 2))
                if temp_date2 < temp_date1:
                    print "error: too many records, falling back to a one-day window"
                    temp_date2 = temp_date1 + timedelta(days=1)

        #--------------------------------------------------------------------#
        # Next part scrapes the URL links from every result page.
        #--------------------------------------------------------------------#
        print "current date range is :", str(temp_date1) + "---" + str(temp_date2)
        # The pager text reads like u"共 N 页" ("N pages in total").
        data_pages = int(data_summary[data_summary.index(u'共') + 1:data_summary.index(u'页')])
        # Build a file name like "2015-3-7" (strip the leading zeros).
        file_name = temp_date1.strftime("%Y-X%m-X%d").replace('X0', 'X').replace('X', '')
        if start_point == 1:
            write_format = 'w'
        else:
            write_format = 'a+'
        export = codecs.open(".\\url\\" + file_name + ".txt", write_format, "utf-8")
        # Log each page's links; on failure, record where we stopped.
        try:
            for i in range(start_point, data_pages + 1):
                href_flag = 0
                while href_flag == 0:
                    num_link = execute_link(driver, i, data_summary, url, temp_date1, temp_date2)
                    href_flag = 1
                    print i, len(num_link)
                    for record in num_link:
                        try:
                            writeline = record.get_attribute("href") + "\n"
                            export.write(writeline)
                        except:
                            print "failed to read a link's href, retrying the page"
                            href_flag = 0
                            break
        except:
            print "record the error"
            error_record(input_thread, temp_date1, temp_date2, i, input_thread)
            export.close()
            e_email.send_mail("error in catching url!", "an error happens")
            # driver.close()
        export.close()

        # Advance the time window for the next iteration.
        start_point = 1
        temp_date1 = temp_date2 + timedelta(days=1)
        temp_date2 = temp_date1 + timedelta(days=10)
        if temp_date1 <= date_end and temp_date2 > date_end:
            temp_date2 = date_end
    driver.close()
    driver.quit()
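# makesure_page is defined elsewhere in this script; catch_url relies on it
# to (re)load the search page with the given date filter. A minimal sketch of
# what such a helper is assumed to do follows. The element ids ("startDate",
# "endDate", "searchBtn") are hypothetical placeholders, not the target
# site's real field names.
def makesure_page_sketch(driver, url, date1, date2):
    driver.get(url)  # (re)load the search page
    start_box = driver.find_element_by_id("startDate")  # hypothetical id
    start_box.clear()
    start_box.send_keys(date1.strftime("%Y-%m-%d"))
    end_box = driver.find_element_by_id("endDate")  # hypothetical id
    end_box.clear()
    end_box.send_keys(date2.strftime("%Y-%m-%d"))
    driver.find_element_by_id("searchBtn").click()  # hypothetical id
    return driver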
# Resume an interrupted run: dict_error_info, err_start_date, err_end_date
# and error_end_date are recovered from the error log earlier in the script.
num_page = int(dict_error_info["num_page"])
catch_url(driver, url, err_start_date, err_end_date, error_end_date, num_page)

# For a fresh start:
total_days = date_end - date_start
total_days = int(total_days.days) + 1
print "total date range : ", date_start, "--", date_end
print "total days: " + str(total_days)

temp_date1 = date_start
temp_date2 = temp_date1 + timedelta(days=10)
if temp_date2 > date_end:
    temp_date2 = date_end
catch_url(driver, url, temp_date1, temp_date2, date_end)
e_email.send_mail("finish and succeed", "The program finished successfully")

##------------------------------------------------------------------------------
## scrape the info
##------------------------------------------------------------------------------
#import glob
#read_file_list = glob.glob('E:\\guoxuanma\\webspider\\url\\*.txt')
#
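# A minimal sketch of the second phase hinted at by the commented-out glob
# lines above: read each saved URL file back and iterate over its links.
# The ".\\url\\" directory matches where catch_url writes its files; the
# name read_saved_urls and the per-link handling are hypothetical, not part
# of the original script.
import glob

def read_saved_urls(pattern=".\\url\\*.txt"):
    # Yield every non-empty link stored by catch_url, one file at a time.
    for file_path in glob.glob(pattern):
        f = codecs.open(file_path, "r", "utf-8")
        try:
            for line in f:
                link = line.strip()
                if link:
                    yield link
        finally:
            f.close()

# Example usage (hypothetical): visit each stored link with the same driver.
#   for link in read_saved_urls():
#       driver.get(link)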