def _resume_from_status(gl, firstline, lastlines, dir_name, logfile):
    """Dispatch on the recorded status line and re-run the matching step.

    Returns the outcome reported by the step that was re-run, or True for
    the retry branches, whose result is not inspected.
    """
    status = lastlines[0]

    if status.startswith('Refresh this page'):
        url_head = url_head_pattern.search(status).group()
        return hash_download(url_head, lastlines[1:], logfile)

    if (status in ('Checked fail: Refresh this page', "Download retry Failed")
            or status.startswith('No such file')):
        retry_hash_download(gl, logfile=logfile)
        return True

    if status.startswith('Error: open page '):
        return check_res(trunk.crawl_subject(firstline, logfile=logfile))

    # The misspelling "dowload" is part of the message being matched.
    if status.startswith('Error: not find dowload path in'):
        return check_res(trunk.crawl_subject(firstline, 0, logfile))

    # Unrecognised status: show any stored failure info and ask the operator.
    # NOTE(review): the original indentation was lost; this assumes the
    # prompt is shown whether or not dir_name is in failed_dict -- confirm.
    if dir_name in failed_dict:
        print('%s has checked failed, info:\n%s'
              % (dir_name, failed_dict[dir_name]))
    choice = raw_input('expect lastline or retry?[y/r/n]')
    if choice.startswith('r'):
        retry_hash_download(gl, logfile=logfile)
    return choice.startswith('y')


def check_dir(dir_name, logfile_name):
    """Check ``index.log`` inside *dir_name* and resume unfinished work.

    The function changes into *dir_name*, reads the first two lines and the
    trailing lines of ``index.log`` through ``trunk.GetFileLine`` and acts on
    the status line (``lastlines[0]``):

    * ``'Success'``: the entry is written to ``clf`` unless
      ``clf.has_download`` already reports it.
    * anything else: the matching download/crawl step is re-run, or the
      operator is asked what to do when the status is not recognised.

    Args:
        dir_name: directory that contains ``index.log``.
        logfile_name: path of a file that receives output of the re-run
            step; a false value sends that output to ``sys.stdout``.

    Returns:
        True when the directory is in a good state (or nothing was judged),
        otherwise the result reported by the re-run step.

    Raises:
        SystemExit: with status 1 when ``index.log`` cannot be opened.

    The working directory in effect on entry is restored on every exit path,
    and every file opened here is closed even when a helper raises.
    """
    prev_cwd = os.getcwd()
    os.chdir(dir_name)
    try:
        try:
            f = open('index.log')
        except IOError:
            print("can't open index.log in %s\n" % dir_name)
            sys.exit(1)

        try:
            gl = trunk.GetFileLine(f, 200)
            firstline, secondline = gl.get_first(2)
            lastlines = gl.get_last(10)

            if lastlines[0] == 'Success':
                print("%s success" % dir_name)
                if clf and not clf.has_download(firstline):
                    clf.write([firstline, secondline, dir_name])
                    clf.write("\n")
                return True

            print("last line of index.log in %s is:" % dir_name)
            print(repr(lastlines[0]))

            if logfile_name:
                logfile = open(logfile_name, 'w+')
            else:
                logfile = sys.stdout
            try:
                return _resume_from_status(gl, firstline, lastlines,
                                           dir_name, logfile)
            finally:
                # Only close what was opened here; never close sys.stdout.
                if logfile is not sys.stdout:
                    logfile.close()
        finally:
            f.close()
    finally:
        # Return to the caller's directory rather than '..', which is only
        # equivalent when dir_name is a single-level, non-symlinked path.
        os.chdir(prev_cwd)
# Crawl every subject page whose id lies in [beginID, endID).  Each page gets
# its own directory, named after the current timestamp, holding an index.log
# that starts with the page's relative URL and its gb18030-encoded title.
try:
    for page_id in xrange(beginID, endID):
        sub_url = 'htm_data/2/1602/1%d.html' % page_id
        url = trunk.domain + sub_url
        content = trunk.open_page(url, 4)
        if not content:
            print('%s open failed\n' % sub_url)
            continue

        soup = BeautifulSoup(content, from_encoding='gbk')
        title = unicode(soup.title.string)
        title_end_pos = title.find(title_end)
        # find() returns -1 when the marker is missing; slicing with -1
        # would silently drop the last character of the title.
        if title_end_pos != -1:
            title = title[:title_end_pos]
        encode_title = str(title.encode('gb18030'))

        now = str(time.time())
        os.mkdir(now)
        os.chdir(now)
        try:
            with open('index.log', 'w+') as logfile:
                logfile.write("%s\n" % sub_url)
                logfile.write("%s\n" % encode_title)
                logfile.write("\n")
                res_tuple = trunk.crawl_subject(sub_url, logfile=logfile)
                if res_tuple[0]:
                    clf.write([sub_url, encode_title, now])
                else:
                    # On failure res_tuple[1] is recorded in front of the
                    # usual fields.
                    clf.write([res_tuple[1], sub_url, encode_title, now])
                clf.write("\n")
        finally:
            # Leave the per-page directory even when the crawl raises, so
            # that an error does not strand the process inside it.
            os.chdir('..')
        # NOTE(review): the original indentation was lost; flushing once per
        # page is assumed (a flush right before close() would be redundant).
        clf.flush()
finally:
    clf.close()