def upload_or_link(tmpfile):
    """Upload the screenshot at `tmpfile`, or reuse the previous upload.

    This is called from inside screenshot_url to avoid uploading duplicate
    files to S3 when the page has not changed.

    The SHA-1 of the file contents is compared with ``previous.image_sha1``;
    on a match the previous (sha1, url) pair is returned and nothing is
    uploaded.  Otherwise the file is uploaded under the name
    "<url_sha1>/<url_sha1>_<timestamp>.png".

    NOTE(review): `previous`, `self` and `now` are not defined in this
    function; presumably they are provided by the enclosing screenshot_url
    scope -- confirm against the caller.

    Args:
        tmpfile: path of the PNG file to hash and (possibly) upload.

    Returns:
        An ``(image_sha1, image_url)`` tuple, or None when `upload_image`
        returns None.
    """
    # Read in binary mode: the payload is a PNG, and text mode may translate
    # newlines on some platforms, which would change the digest.
    with open(tmpfile, 'rb') as png_file:
        image_sha1 = hashlib.sha1(png_file.read()).hexdigest()

    # Unchanged page: link to the image that was already uploaded.
    if previous and previous.image_sha1 == image_sha1:
        return (previous.image_sha1, previous.image_url)

    filename = "{hash}/{hash}_{timestamp}.png".format(
        hash=self.url_sha1, timestamp=abbrev_isoformat(now))
    new_url = upload_image(tmpfile, filename, 'image/png')
    if new_url is None:
        return None
    return (image_sha1, new_url)
def upload_or_link(tmpfile):
    """Upload the screenshot at `tmpfile`, or reuse the previous upload.

    This is called from inside screenshot_url to avoid uploading duplicate
    files to S3 when the page has not changed.

    The SHA-1 of the file contents is compared with ``previous.image_sha1``;
    on a match the previous (sha1, url) pair is returned and nothing is
    uploaded.  Otherwise the file is uploaded under the name
    "<url_sha1>/<url_sha1>_<timestamp>.png".

    NOTE(review): `previous`, `self` and `now` are not defined in this
    function; presumably they are provided by the enclosing screenshot_url
    scope -- confirm against the caller.

    Args:
        tmpfile: path of the PNG file to hash and (possibly) upload.

    Returns:
        An ``(image_sha1, image_url)`` tuple, or None when `upload_image`
        returns None.
    """
    # Read in binary mode: the payload is a PNG, and text mode may translate
    # newlines on some platforms, which would change the digest.
    with open(tmpfile, 'rb') as png_file:
        image_sha1 = hashlib.sha1(png_file.read()).hexdigest()

    # Unchanged page: link to the image that was already uploaded.
    if previous and previous.image_sha1 == image_sha1:
        return (previous.image_sha1, previous.image_url)

    filename = "{hash}/{hash}_{timestamp}.png".format(
        hash=self.url_sha1, timestamp=abbrev_isoformat(now))
    new_url = upload_image(tmpfile, filename, 'image/png')
    if new_url is None:
        return None
    return (image_sha1, new_url)
def relative_mirror_url(mirror):
    """Build the relative path that addresses the mirrored copy of a page.

    The result has the form
    "<state>/<url_sha1>/<timestamp>/<netloc>/<path>", where <timestamp> is
    the mirror's timestamp converted to settings.TIME_ZONE and formatted
    with abbrev_isoformat.  A root ("/") or empty URL path maps to
    "index.html"; other paths have their leading and trailing slashes
    stripped.  A query string, if present, is appended after a literal
    "%3F" (a percent-encoded "?").

    Args:
        mirror: object exposing `timestamp` (presumably a timezone-aware
            datetime, since astimezone() is called on it) and
            `election_url`, which in turn provides `url`, `state` and
            `url_sha1`.

    Returns:
        The relative path as a string.
    """
    local_tz = pytz.timezone(settings.TIME_ZONE)
    timestamp = abbrev_isoformat(mirror.timestamp.astimezone(local_tz))

    election_url = mirror.election_url
    parsed = urlparse.urlparse(election_url.url)

    # A bare host ("http://example.com") parses to an empty path; treat it
    # like "/" so the result does not end in a dangling "<netloc>/".
    if parsed.path in ('', '/'):
        fixedpath = "index.html"
    else:
        fixedpath = parsed.path.strip('/')

    path = '{state}/{sha1}/{timestamp}/{netloc}/{path}'.format(
        state=election_url.state,
        sha1=election_url.url_sha1,
        timestamp=timestamp,
        netloc=parsed.netloc,
        path=fixedpath)
    if parsed.query:
        path = path + '%3F' + parsed.query
    return path
def relative_mirror_url(mirror):
    """Build the relative path that addresses the mirrored copy of a page.

    The result has the form
    "<state>/<url_sha1>/<timestamp>/<netloc>/<path>", where <timestamp> is
    the mirror's timestamp converted to settings.TIME_ZONE and formatted
    with abbrev_isoformat.  A root ("/") or empty URL path maps to
    "index.html"; other paths have their leading and trailing slashes
    stripped.  A query string, if present, is appended after a literal
    "%3F" (a percent-encoded "?").

    Args:
        mirror: object exposing `timestamp` (presumably a timezone-aware
            datetime, since astimezone() is called on it) and
            `election_url`, which in turn provides `url`, `state` and
            `url_sha1`.

    Returns:
        The relative path as a string.
    """
    local_tz = pytz.timezone(settings.TIME_ZONE)
    timestamp = abbrev_isoformat(mirror.timestamp.astimezone(local_tz))

    election_url = mirror.election_url
    parsed = urlparse.urlparse(election_url.url)

    # A bare host ("http://example.com") parses to an empty path; treat it
    # like "/" so the result does not end in a dangling "<netloc>/".
    if parsed.path in ('', '/'):
        fixedpath = "index.html"
    else:
        fixedpath = parsed.path.strip('/')

    path = '{state}/{sha1}/{timestamp}/{netloc}/{path}'.format(
        state=election_url.state,
        sha1=election_url.url_sha1,
        timestamp=timestamp,
        netloc=parsed.netloc,
        path=fixedpath)
    if parsed.query:
        path = path + '%3F' + parsed.query
    return path
def mirror_url(urlobj):
    """Mirror `urlobj.url` with wget into a new timestamped directory.

    The destination directory is
    <settings.MIRROR_ROOT>/<state>/<url_sha1>/<timestamp>, with the
    timestamp taken in settings.TIME_ZONE.  If `urlobj.latest_mirror()`
    returns a previous mirror, its directory is copied to the destination
    before wget runs (presumably so that wget's -N timestamping can skip
    unchanged files -- confirm).  wget's log is written to "wget.log" in
    the per-URL root directory.

    Args:
        urlobj: object providing `url`, `state`, `url_sha1`, `user_agent`
            and a `latest_mirror()` method.

    Returns:
        The newly created ElectionMirror, or None when the wget run raised
        ProcessTimeout (the failure is logged).
    """
    local_timezone = pytz.timezone(settings.TIME_ZONE)
    now = pytz.datetime.datetime.now(tz=local_timezone)

    url_mirror_root = os.path.abspath(
        os.path.join(settings.MIRROR_ROOT, urlobj.state, urlobj.url_sha1))
    if not os.path.exists(url_mirror_root):
        os.makedirs(url_mirror_root)
    dest_dir = os.path.join(url_mirror_root, abbrev_isoformat(now))
    log_path = os.path.join(url_mirror_root, "wget.log")

    previous = urlobj.latest_mirror()
    if previous:
        copy_dir(previous.dir, dest_dir)
    elif not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    args = ["wget", "--no-verbose", "-p", "--convert-links", "--wait=1",
            "-N", "--random-wait"]
    # Only pass --user-agent when one is configured; an empty-string
    # argument would otherwise be handed to wget as a bogus positional
    # argument.
    if urlobj.user_agent:
        # NOTE(review): the single quotes become part of the header value
        # unless run_subprocess_safely goes through a shell -- confirm.
        args.append("--user-agent='{ua}'".format(ua=urlobj.user_agent))
    args.extend(["-o", log_path, "-P", dest_dir, urlobj.url])

    try:
        run_subprocess_safely(args)
    except ProcessTimeout as e:
        log.error(u"wget failed to mirror url {url}: {e} {e_type}",
                  url=urlobj.url, e=unicode(e), e_type=type(e))
        return None

    return ElectionMirror.objects.create(election_url=urlobj,
                                         timestamp=now,
                                         dir=dest_dir)
def mirror_url(urlobj):
    """Mirror `urlobj.url` with wget into a new timestamped directory.

    The destination directory is
    <settings.MIRROR_ROOT>/<state>/<url_sha1>/<timestamp>, with the
    timestamp taken in settings.TIME_ZONE.  If `urlobj.latest_mirror()`
    returns a previous mirror, its directory is copied to the destination
    before wget runs (presumably so that wget's -N timestamping can skip
    unchanged files -- confirm).  wget's log is written to "wget.log" in
    the per-URL root directory.

    Args:
        urlobj: object providing `url`, `state`, `url_sha1`, `user_agent`
            and a `latest_mirror()` method.

    Returns:
        The newly created ElectionMirror, or None when the wget run raised
        ProcessTimeout (the failure is logged).
    """
    local_timezone = pytz.timezone(settings.TIME_ZONE)
    now = pytz.datetime.datetime.now(tz=local_timezone)

    url_mirror_root = os.path.abspath(
        os.path.join(settings.MIRROR_ROOT, urlobj.state, urlobj.url_sha1))
    if not os.path.exists(url_mirror_root):
        os.makedirs(url_mirror_root)
    dest_dir = os.path.join(url_mirror_root, abbrev_isoformat(now))
    log_path = os.path.join(url_mirror_root, "wget.log")

    previous = urlobj.latest_mirror()
    if previous:
        copy_dir(previous.dir, dest_dir)
    elif not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    args = ["wget", "--no-verbose", "-p", "--convert-links", "--wait=1",
            "-N", "--random-wait"]
    # Only pass --user-agent when one is configured; an empty-string
    # argument would otherwise be handed to wget as a bogus positional
    # argument.
    if urlobj.user_agent:
        # NOTE(review): the single quotes become part of the header value
        # unless run_subprocess_safely goes through a shell -- confirm.
        args.append("--user-agent='{ua}'".format(ua=urlobj.user_agent))
    args.extend(["-o", log_path, "-P", dest_dir, urlobj.url])

    try:
        run_subprocess_safely(args)
    except ProcessTimeout as e:
        log.error(u"wget failed to mirror url {url}: {e} {e_type}",
                  url=urlobj.url, e=unicode(e), e_type=type(e))
        return None

    return ElectionMirror.objects.create(election_url=urlobj,
                                         timestamp=now,
                                         dir=dest_dir)