def download_latest_hlebsol():
    """Refresh the hlebsol and fusion menu fixtures from hleb-sol.biz.

    Downloads the two spreadsheets ``1.xls`` and ``2.xls`` with ``wget``,
    moves them to ``dinner/fixtures/hlebsol-current.xls`` and
    ``dinner/fixtures/hlebsol-next.xls``, and then copies each of them to the
    matching ``fusion-*.xls`` fixture.

    All paths are relative, so they resolve against the current working
    directory of the process. Any failure of an underlying command propagates
    as the exception raised by ``pbs``.
    """
    # Imported locally, as in the original, so the module can be loaded
    # without ``pbs`` installed as long as this function is never called.
    from pbs import wget, mv, cp

    # (remote URL, file name wget writes into the cwd, final fixture path)
    downloads = (
        ('http://hleb-sol.biz/templates/1.xls',
         '1.xls',
         'dinner/fixtures/hlebsol-current.xls'),
        ('http://hleb-sol.biz/templates/2.xls',
         '2.xls',
         'dinner/fixtures/hlebsol-next.xls'),
    )
    # (source fixture, duplicate fixture)
    copies = (
        ('dinner/fixtures/hlebsol-current.xls',
         'dinner/fixtures/fusion-current.xls'),
        ('dinner/fixtures/hlebsol-next.xls',
         'dinner/fixtures/fusion-next.xls'),
    )

    # Fetch and place both spreadsheets first; the copies are made only after
    # every download succeeded, so a failed download never leaves the fusion
    # fixtures half-updated.
    for url, downloaded_name, fixture_path in downloads:
        # -N: timestamping, -S: print the server response headers.
        wget('-N', '-S', url)
        mv(downloaded_name, fixture_path)

    for source_path, target_path in copies:
        cp(source_path, target_path)
def rip_site(domain_name, output_dir, timeout=120):
    """Download a page with wget and return the path of its main HTML file.

    The page at ``domain_name`` and its requisites are fetched into
    ``output_dir`` as a flat directory. The result is ``index.html`` from
    that directory when it exists; otherwise it is the largest non-empty
    ``*.html`` file found there.

    Args:
        domain_name: Address handed to wget as the URL to fetch.
        output_dir: Directory the downloaded files are written to.
        timeout: Overall time budget in seconds; wget's ``--read-timeout``
            is set to ``timeout - 10``.

    Returns:
        Path (``output_dir`` joined with the file name) of the chosen HTML
        file.

    Raises:
        Exception: If the download produced no usable ``.html`` file.
    """
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.22) Gecko/20110905 Ubuntu/10.04 (lucid) Firefox/3.6.22"

    # -E adjust extensions, -H span hosts, -k convert links, -p fetch page
    # requisites, -nd no directory hierarchy, -P output prefix, -t retries.
    # _ok_code: besides 0, wget exit codes 3 and 8 are accepted as success so
    # that a partially failed mirror still yields whatever was downloaded.
    # NOTE(review): timeout values <= 10 give a zero/negative read timeout;
    # presumably callers always pass something larger -- confirm.
    sh.wget(
        "-U", user_agent,
        "--no-check-certificate",
        "--quiet",
        "-EHkp",
        "--read-timeout", timeout - 10,
        "-t", 2,
        "-nd",
        "-P", output_dir,
        domain_name,
        _ok_code=[3, 0, 8]
    )

    # default to index.html
    index = join(output_dir, "index.html")
    if exists(index):
        return index

    # otherwise just use the biggest html file; empty files never qualify
    # because the size has to beat the initial 0, and the first file seen
    # wins a tie.
    best_name = None
    best_size = 0
    for name in os.listdir(output_dir):
        if not name.endswith(".html"):
            continue
        size = os.stat(join(output_dir, name)).st_size
        if size > best_size:
            best_name, best_size = name, size

    if best_name is None:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception("no html file found in download")
    return join(output_dir, best_name)