Example #1
import re
import time
import codecs
from datetime import timedelta
# makesure_page, execute_link, error_record, e_email and input_thread are
# project helpers defined elsewhere in this spider.

def catch_url(driver, url, temp_date1, temp_date2, date_end, start_point=1):
    while temp_date2 <= date_end:

        data_confirm = "n"
        day_interval = 10
        while data_confirm != "y":

            driver = makesure_page(driver, url, temp_date1, temp_date2)
            flag = 0
            while flag == 0:
                try:
                    data_summary = driver.find_element_by_css_selector("td.pager").text
                    flag = 1
                except Exception:
                    print "error in finding the pager element"
                    driver = makesure_page(driver, url, temp_date1, temp_date2)
                    time.sleep(5)

                if flag == 1:
                    print "got the pager info"
                    break

            print "#" + data_summary + "#"
            page_info = re.findall(r"[-+]?\d+\.?\d*", data_summary)
            num_page = page_info[0]
            total_record = page_info[-1]  # total record count (not used below)
            # more than 200 result pages means the date range returns too
            # many records, so shrink the window and query again
            if int(num_page) <= 200:
                data_confirm = "y"
            else:
                temp_date2 = temp_date2 - timedelta(days=day_interval // 2)
                if temp_date2 < temp_date1:
                    print "error: still too many records, falling back to a one-day range"
                    temp_date2 = temp_date1 + timedelta(days=1)
    #------------------------------------------------------------------------------------------------------------------------------------------#
    # next part scrapes the URL links from each result page
    #------------------------------------------------------------------------------------------------------------------------------------------#
        print "current date range is: " + str(temp_date1) + " --- " + str(temp_date2)
        # the page count sits between the characters '共' and '页'
        # ("共 N 页" means "N pages in total")
        data_pages = int(data_summary[data_summary.index(u'共') + 1:data_summary.index(u'页')])
        # file name like "2015-3-7": the X trick strips leading zeros from month and day
        file_name = temp_date1.strftime("%Y-X%m-X%d").replace('X0', 'X').replace('X', '')
        # start_point > 1 means we are resuming after an error, so append instead of overwrite
        if start_point == 1:
            write_format = 'w'
        else:
            write_format = 'a+'

        export = codecs.open(".\\url\\" + file_name + ".txt", write_format, "utf-8")

        # scrape every result page and log each record's href
        try:
            for i in range(start_point, data_pages + 1):
                href_flag = 0
                while href_flag == 0:
                    num_link = execute_link(driver, i, data_summary, url, temp_date1, temp_date2)
                    href_flag = 1
                    print i, len(num_link)
                    for record in num_link:
                        try:
                            writeline = record.get_attribute("href") + "\n"
                            export.write(writeline)
                        except Exception:
                            # the href could not be read (e.g. a stale element): redo this page
                            print "failed to get a link, retrying page", i
                            href_flag = 0
                            break
        except Exception:
            print "recording the error"
            # note: the original code passes input_thread as both the first and the last argument
            error_record(input_thread, temp_date1, temp_date2, i, input_thread)
            export.close()
            e_email.send_mail("error in catching url!", "an error happened")
            # driver.close()

        export.close()
        
    #------------------------------------------------------------------------------------------------------------------------------------------#    
        # advance to the next 10-day window, clamping the end to date_end
        start_point = 1
        temp_date1 = temp_date2 + timedelta(days=1)
        temp_date2 = temp_date1 + timedelta(days=10)
        if temp_date1 <= date_end and temp_date2 > date_end:
            temp_date2 = date_end
    
    
    driver.close()
    driver.quit()
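
Both examples rely on helpers defined elsewhere in the project. As an illustration only, here is a minimal sketch of what makesure_page might look like; the form field names "startDate" and "endDate" are assumptions, not the site's real fields:

# Hypothetical sketch: the real makesure_page is project-specific.
# It (re)loads the search page, fills in the date range and submits the form.
def makesure_page(driver, url, date1, date2):
    driver.get(url)
    start_box = driver.find_element_by_name("startDate")   # assumed field name
    start_box.clear()
    start_box.send_keys(str(date1))
    end_box = driver.find_element_by_name("endDate")       # assumed field name
    end_box.clear()
    end_box.send_keys(str(date2))
    end_box.submit()
    return driver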
Example #2
    # resume from the page recorded in a previous error log
    num_page = int(dict_error_info["num_page"])
    catch_url(driver, url, err_start_date, err_end_date, error_end_date, num_page)

    # for a fresh start
    total_days = date_end - date_start
    total_days = int(total_days.days) + 1
    print "total date range: ", date_start, "--", date_end
    print "total days: " + str(total_days)
    temp_date1 = date_start
    temp_date2 = temp_date1 + timedelta(days=10)
    if temp_date2 > date_end:
        temp_date2 = date_end
    catch_url(driver, url, temp_date1, temp_date2, date_end)


e_email.send_mail("finished successfully", "The program completed without errors")
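
As with the other helpers, e_email is project-specific. A minimal sketch of a send_mail it might wrap, using only the standard library (the addresses and SMTP host below are placeholders):

# Hypothetical sketch: the real e_email module is project-specific.
import smtplib
from email.mime.text import MIMEText

def send_mail(subject, body):
    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = "spider@example.com"    # placeholder sender
    msg["To"] = "me@example.com"          # placeholder recipient
    server = smtplib.SMTP("localhost")    # assumes a local SMTP relay
    server.sendmail(msg["From"], [msg["To"]], msg.as_string())
    server.quit()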


#
##------------------------------------------------------------------------------
## scrape the info
##------------------------------------------------------------------------------
#import glob
#read_file_list = glob.glob('E:\\guoxuanma\\webspider\\url\\*.txt')
#
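
The commented-out block above hints at the next stage: reading the saved URL files back in. A minimal sketch of how that stage might start, using the directory from the comment above (the per-URL processing is left open):

# Hypothetical sketch of the next stage: iterate over the saved URL files.
import glob
import codecs

read_file_list = glob.glob('E:\\guoxuanma\\webspider\\url\\*.txt')
for file_path in read_file_list:
    f = codecs.open(file_path, "r", "utf-8")
    for line in f:
        record_url = line.strip()
        print record_url  # placeholder: fetch and parse each record page here
    f.close()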