def find_link(region, zone, url_dir):
    if os.path.isfile(url_dir):
        if url_dir.endswith('Top_URL.txt'):
            print url_dir
            # Take every link in the file as a seed address and start downloading automatically
            for line in open(url_dir):
                try:
                    line_content = line.strip('\n')
                    rank, userUrl = line_content.split('\t')
                    have_a_rest("23:00:00", "06:00:00")
                    print "Zone " + zone + ": No.", rank, " website is downloading: ", userUrl
                    #result='''
                    dir_name = "more_sites"
                    save_path = web4.makeDIR(region, zone)
                    save_path = web4.makeDIR(save_path, dir_name)
                    # Set the crawl depth for the site: the higher the rank, the deeper the crawl
                    rank = int(rank)
                    depth = 1
                    dir_path, myUrl, filestamp = web4.saveHTM(
                        region, zone, save_path, userUrl, rank, depth)  # save the web page
                    if dir_path != 'Error':
                        # Crawl succeeded; wait one second before continuing
                        print "please wait for 1 second"
                        time.sleep(1)
                    else:
                        print dir_path, myUrl, filestamp
                    #'''
                except Exception, e:
                    print 'Error', str(e)
                    # Build the error-log path
                    error_path = web4.makeDIR(region, zone)
                    error_path = web4.makeDIR(error_path, "url")
                    error_path = os.path.join(error_path, 'get_start_site_ErrorLog.txt')
                    ErrorLog = open(error_path, 'a')
                    # Append every error that occurs
                    ErrorLog.write(line_content + '\tError:\t' + str(e))
                    ErrorLog.close()
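# have_a_rest() is called above with a night window ("23:00:00" to "06:00:00") but is not
# defined in this fragment. A minimal sketch, assuming it simply blocks while the current
# time lies inside that window (the window may wrap around midnight), might look like:
import datetime
import time

def have_a_rest(start, end):
    start_t = datetime.datetime.strptime(start, "%H:%M:%S").time()
    end_t = datetime.datetime.strptime(end, "%H:%M:%S").time()
    while True:
        now = datetime.datetime.now().time()
        if start_t <= end_t:
            in_window = start_t <= now < end_t
        else:
            # The window crosses midnight, e.g. 23:00:00 - 06:00:00
            in_window = now >= start_t or now < end_t
        if not in_window:
            break
        time.sleep(60)  # sleep and re-check once a minute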
def move_url_file(filename):
    # Rebuild the URL file's path from its name
    region, zone, timestamp, depth, postex = filename.split('_', 4)
    url_file_name = filename + ".txt"
    url_path = os.path.join(region, zone)
    url_path = os.path.join(url_path, "more_urls")
    dateDir = timestamp[0:8]
    url_path = os.path.join(url_path, dateDir)
    src_path = os.path.join(url_path, url_file_name)
    dst_path = web4.makeDIR(url_path, "done")
    dst_path = os.path.join(dst_path, url_file_name)
    shutil.move(src_path, dst_path)
    print "Now put file " + filename + " into Done directory..."
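# Usage sketch (the file name below is hypothetical, following the
# region_zone_timestamp_depth_postfix pattern implied by the split above):
#   move_url_file("mil_zone1_20170810120000_1_URL")
# would move mil/zone1/more_urls/20170810/mil_zone1_20170810120000_1_URL.txt
# into mil/zone1/more_urls/20170810/done/.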
def find_link(region, url_dir, timestamp, sec):
    # Walk the directory that holds the URL files, find each file and visit its links
    finish = "yes"  # completion flag for the download round, defaults to "yes"
    # Build the error-log path
    error_path = web4.makeDIR(url_dir, "err_log")
    error_path = os.path.join(error_path, 'file_web_ErrorLog.txt')
    ErrorLog = open(error_path, 'a')  # append every crawl error to the log
    for s in os.listdir(url_dir):
        thisDir = os.path.join(url_dir, s)
        if "done" in thisDir or "err_log" in thisDir:
            # Skip the err_log and done directories
            continue
        elif thisDir.endswith('URL.txt'):
            # Found a URL file: read it and download every link inside
            print thisDir
            finish = "no"
            path, filename = os.path.split(thisDir)  # split path and file name
            filename, filetype = os.path.splitext(filename)  # split file name and extension
            # Take every link in the file as a seed address and start downloading automatically
            for line in open(thisDir):
                try:
                    line = line.strip('\n')
                    #print line + ","
                    userUrl, fromUrl, rank = line.split('\t', 2)
                    if not userUrl.startswith('http'):
                        continue
                    have_a_rest("23:00:00", "06:00:00")  # stop downloading at night
                    get(filename, userUrl, rank, sec)
                except Exception, e:
                    print 'Error', str(e)
                    ErrorLog.write(line + '\t' + 'Error:' + '\t' + str(e))
            print "Links in file " + filename + " are done!"
            # Move the finished URL file into the done directory at the same level
            move_url_file(filename)
    ErrorLog.close()
    return finish

#print now_time
if __name__ == '__main__':
    while True:
        region = raw_input("Enter the region the crawler is assigned to: ")
        zone = raw_input("Enter the zone (working directory) of the crawler: ")
        timestamp = raw_input("please input the date name (e.g. 20170810): ")
        sec = raw_input("please input the time between 2 visits: ")
        url_path = web4.makeDIR(region, zone)
        url_path = web4.makeDIR(url_path, "more_urls")  # search inside the more_urls subdirectory
        url_path = web4.makeDIR(url_path, timestamp)
        # Walk the directory that holds the URL files, find each file and visit its links
        find_link(region, url_path, timestamp, sec)
        print "All websites are processed."
-------------------------------------------------------------------------------
"""
#---------------------------------import---------------------------------------
import os
import shutil
import urllib2
import re
from BeautifulSoup import BeautifulSoup

import get_web_list
import web4
import thisLink3_0
#------------------------------------------------------------------------------

###############################################################################
if __name__ == "__main__":
    # 1. Download the ranking-list pages
    region = raw_input("Enter the field the crawler covers (mil, sports...): ")
    size = raw_input("Enter the size of each zone within the field (in pages): ")
    src_dir = web4.makeDIR(region, "url")
    get_web_list.find_link(region, src_dir)
    print "The ChinaZ ranking pages are already done!"

    # 2. Extract the website links from the ranking pages
    site_path = web4.makeDIR(region, "site")  # create the save directory
    thisLink3_0.find_link(region, site_path, size)
    print "All zones of this field have been created. Run main.py to start multi-process downloading!"
import web4
import get_start_site
import thisLink2_1
import file_web2_1
#------------------------------------------------------------------------------

###############################################################################
if __name__ == "__main__":
    region = raw_input("Enter the region the crawler is assigned to: ")
    zone = raw_input("Enter the zone (working directory) of the crawler: ")
    # zone is a partition under the same region, which makes it easy to run several download processes in parallel
    timestamp = raw_input("Please input the timestamp (e.g. 20170809) you want to do: ")
    sec = raw_input("please input the time between 2 crawling rounds: ")
    root_url_dir = web4.makeDIR(region, zone)
    root_url_dir = web4.makeDIR(root_url_dir, "url")
    # Walk the directory that holds the URL files, find each file and visit it
    get_start_site.find_link(region, zone, root_url_dir)
    print "All seed webpages are done! Now begin to download other webpages..."
    while True:
        # Pull links from the database; the timestamp used for the database and the timestamp
        # used by file_web2_1 below default to the same value
        over = thisLink2_1.find_file(region, zone, timestamp)
        print over
        # Download web pages according to the URL files
        #result='''
        print "Now start the new round of crawling..."
    elif os.path.isdir(url_dir):
        # Recurse into subdirectories and keep looking for Top_URL.txt files
        for s in os.listdir(url_dir):
            newDir = os.path.join(url_dir, s)
            find_link(region, zone, newDir)

#print now_time
if __name__ == '__main__':
    while True:
        region = raw_input("Enter the region the crawler works in: ")
        zone = raw_input("Enter the zone (partition) the crawler works in: ")
        # zone is a partition under the same region, which makes it easy to run several download processes in parallel
        url_dir = web4.makeDIR(region, zone)
        url_dir = web4.makeDIR(url_dir, "url")
        # Walk the directory that holds the URL files, find each file and visit its links
        find_link(region, zone, url_dir)
        print "All websites are processed."
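# web4.makeDIR() is used throughout these scripts but is not defined in this fragment.
# A minimal sketch, assuming it joins the two path components, creates the directory if it
# does not exist yet, and returns the resulting path (the signature is an assumption):
import os

def makeDIR(parent, child):
    path = os.path.join(parent, child)
    if not os.path.isdir(path):
        os.makedirs(path)
    return path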