def process_body(self, body, item):
    """Dispatch an upload task based on the tracker's JSON reply.

    *body* is JSON text from the tracker.  When it carries an
    ``upload_target`` URL, an inner upload task (rsync or curl, chosen by
    URL scheme) is wired up and enqueued for *item*; a missing target
    schedules a retry, and an unknown scheme fails the item.
    """
    response = json.loads(body)
    if "upload_target" not in response:
        item.log_output("Tracker did not provide an upload target.")
        self.schedule_retry(item)
        return

    target = response["upload_target"]
    files = realize(self.files, item)

    if re.match(r"^rsync://", target):
        item.log_output("Uploading with Rsync to %s" % target)
        upload = RsyncUpload(
            target, files,
            target_source_path=self.rsync_target_source_path,
            bwlimit=self.rsync_bwlimit,
            extra_args=self.rsync_extra_args,
            max_tries=1)
    elif re.match(r"^https?://", target):
        item.log_output("Uploading with Curl to %s" % target)
        # Curl handles exactly one file per invocation.
        if len(files) != 1:
            item.log_output("Curl expects to upload a single file.")
            self.fail_item(item)
            return
        upload = CurlUpload(
            target, files[0],
            self.curl_connect_timeout,
            self.curl_speed_limit,
            self.curl_speed_time,
            max_tries=1)
    else:
        item.log_output("Received invalid upload type.")
        self.fail_item(item)
        return

    upload.on_complete_item += self._inner_task_complete_item
    upload.on_fail_item += self._inner_task_fail_item
    upload.enqueue(item)
def handle_process_error(self, exit_code, item):
    """Handle a failed fetch attempt for *item*.

    Logs the error and bumps the item's try counter, then either schedules
    a retry on the IOLoop after ``self.retry_delay`` seconds (while the
    max-tries / exit-code policy allows) or closes the item's log writer
    and fails the item for good.
    """
    if item["current_is_torrent"]:
        # A failing torrent fetch usually means throttling (or a dead
        # server); there is no point probing further pages right now.
        item.log_output("Got throttled on ID %s (or server died)"
                        % item["current_id"])
    # Both branches of the original if/else assigned this same value;
    # hoisted here (only the log line above is branch-specific).
    retry_delay = self.retry_delay

    item["tries"] += 1
    item.log_output("Process %s returned exit code %d for %s\n"
                    % (self, exit_code, item.description()))
    item.log_error(self, exit_code)

    tries_ok = self.max_tries is None or item["tries"] < self.max_tries
    code_ok = (self.retry_on_exit_code is None
               or exit_code in self.retry_on_exit_code)
    if tries_ok and code_ok:
        item.log_output("Retrying %s for %s after %d seconds...\n"
                        % (self, item.description(), retry_delay))
        IOLoop.instance().add_timeout(
            datetime.timedelta(seconds=retry_delay),
            functools.partial(self.process_one, item))
    else:
        item.log_output("Failed %s for %s\n" % (self, item.description()))
        item["logwriter"].close()
        self.fail_item(item)
def process(self, item):
    """Move the item's downloaded files into the data directory.

    For every ID in the item's ``start_id``..``end_id`` range, both the
    ``.torrent`` and the ``.warc.gz`` file are renamed from ``item_dir``
    into ``data_dir``; the now-empty ``item_dir`` is then removed.
    """
    for id_ in range(item["start_id"], item["end_id"] + 1):
        # file_bases maps id -> (torrent filename, warc filename);
        # move both, torrent first (same order as before).
        for base in item["file_bases"][id_]:
            source = "%s/%s" % (item["item_dir"], base)
            dest = "%s/%s" % (item["data_dir"], base)
            item.log_output("Moving file from %s to %s" % (source, dest))
            os.rename(source, dest)
    # Direct key lookup instead of the roundabout '%(item_dir)s' % item.
    shutil.rmtree(item["item_dir"])
def process_body(self, body, item):
    """Parse the tracker's JSON response and enqueue the matching upload.

    An ``upload_target`` URL in the response selects rsync or HTTP (curl)
    uploading for *item*'s files; a missing target schedules a retry, and
    an unrecognised scheme fails the item.
    """
    reply = json.loads(body)
    if "upload_target" in reply:
        upload_url = reply["upload_target"]
        upload_files = realize(self.files, item)
        inner_task = None
        if re.match(r"^rsync://", upload_url):
            item.log_output("Uploading with Rsync to %s" % upload_url)
            inner_task = RsyncUpload(
                upload_url,
                upload_files,
                target_source_path=self.rsync_target_source_path,
                bwlimit=self.rsync_bwlimit,
                extra_args=self.rsync_extra_args,
                max_tries=1)
        elif re.match(r"^https?://", upload_url):
            item.log_output("Uploading with Curl to %s" % upload_url)
            # Curl uploads exactly one file; anything else is an error.
            if len(upload_files) != 1:
                item.log_output("Curl expects to upload a single file.")
                self.fail_item(item)
                return
            inner_task = CurlUpload(
                upload_url,
                upload_files[0],
                self.curl_connect_timeout,
                self.curl_speed_limit,
                self.curl_speed_time,
                max_tries=1)
        else:
            item.log_output("Received invalid upload type.")
            self.fail_item(item)
            return
        inner_task.on_complete_item += self._inner_task_complete_item
        inner_task.on_fail_item += self._inner_task_fail_item
        inner_task.enqueue(item)
    else:
        item.log_output("Tracker did not provide an upload target.")
        self.schedule_retry(item)
def process(self, item):
    """Prepare the working directory and placeholder files for an item.

    The item name must look like ``"<start>-<end>"``.  The parsed ID range
    is stored on the item, a fresh ``item_dir`` is (re)created under
    ``data_dir``, and an empty ``.torrent``/``.warc.gz`` file pair is
    created per ID, with the base names recorded in ``item["file_bases"]``.
    """
    item_name = item["item_name"]
    assert "-" in item_name
    start, end = [int(x) for x in item_name.split("-", 1)]
    item["start_id"] = start
    item["end_id"] = end
    item["file_bases"] = {}

    dirname = "/".join((item["data_dir"], item_name))
    if os.path.isdir(dirname):
        # Leftovers from an earlier attempt: start from a clean slate.
        shutil.rmtree(dirname)
    item.log_output("Creating directory %s" % dirname)
    os.makedirs(dirname)
    item["item_dir"] = dirname

    for id_ in range(start, end + 1):
        file_base = "%s-%s-%s" % (
            self.file_prefix, time.strftime("%Y%m%d-%H%M%S"), id_)
        torrent_base = "%s.torrent" % file_base
        warc_base = "%s.warc.gz" % file_base
        item["file_bases"][id_] = (torrent_base, warc_base)
        for base in (torrent_base, warc_base):
            # Build the path once; the original chained two '%' operators
            # ("fmt % s % tuple"), which only worked via left-associativity.
            path = "%s/%s" % (dirname, base)
            item.log_output("Creating file %s" % path)
            open(path, "w").close()
def handle_process_result(self, exit_code, item):
    """Advance the item after a fetch finished without a process error.

    A pending torrent-404 flag makes the current ID be skipped entirely;
    otherwise the item alternates between its torrent and metadata phases.
    In both cases the next URL is processed, or the item is finished when
    none remain.
    """
    if item["torrent_404"]:
        item.log_output("404 for torrent file detected, skipping ID %s..."
                        % item["current_id"])
        item["torrent_404"] = False
    else:
        if item["current_is_torrent"]:
            message = "Found torrent for ID %s, fetching metadata..."
        else:
            message = "Metadata for ID %s fetched. Moving on to next ID..."
        item.log_output(message % item["current_id"])
        # Alternate between the torrent fetch and the metadata fetch.
        item["current_is_torrent"] = not item["current_is_torrent"]

    # Both branches of the original ended with this identical dispatch.
    if self.set_next_url(item):
        self.process_one(item)
    else:
        self.handle_done(item)
def process(self, item):
    """Set up a clean item directory with empty torrent/warc placeholders.

    Parses the ``start-end`` ID range out of the item name, recreates the
    per-item directory beneath ``data_dir``, and records each generated
    ``(torrent, warc)`` base-name pair in ``item["file_bases"]``.
    """
    name = item["item_name"]
    assert "-" in name
    bounds = name.split("-", 1)
    start = int(bounds[0])
    end = int(bounds[1])
    item["start_id"] = start
    item["end_id"] = end
    item["file_bases"] = {}

    dirname = item["data_dir"] + "/" + name
    if os.path.isdir(dirname):
        # Wipe any remains of a previous run of this item.
        shutil.rmtree(dirname)
    item.log_output("Creating directory %s" % dirname)
    os.makedirs(dirname)
    item["item_dir"] = dirname

    id_ = start
    while id_ <= end:
        # Timestamp is taken per ID, exactly as before.
        file_base = "%s-%s-%s" % (self.file_prefix,
                                  time.strftime("%Y%m%d-%H%M%S"), id_)
        torrent_base = file_base + ".torrent"
        warc_base = file_base + ".warc.gz"
        item["file_bases"][id_] = (torrent_base, warc_base)
        item.log_output("Creating file %s" % ("%s/%s" % (dirname, torrent_base)))
        open("%s/%s" % (dirname, torrent_base), "w").close()
        item.log_output("Creating file %s" % ("%s/%s" % (dirname, warc_base)))
        open("%s/%s" % (dirname, warc_base), "w").close()
        id_ += 1
# NOTE(review): this definition is a whitespace-collapsed paste and is
# truncated — it ends with a dangling "else:" whose body is not visible in
# this chunk, a "#"-comment in the middle proves lost newlines, and it uses
# Python 2-only syntax ("except urllib2.HTTPError, e:").  Left byte-identical;
# reconstructing the missing else-branch would be guesswork — TODO recover the
# original formatting/remainder from version control before editing.
def process_one(self, item): with self.task_cwd(): url = item["current_url"] torrent_name, warc_name = item["file_bases"][item["current_id"]] item.log_output("Start downloading URL %s" % url) if item["current_is_torrent"]: extra_args = ["--output-document"] # F**k it, tired of wget try: data = urllib2.urlopen(url).read() if item["current_is_torrent"] and "Torrent not available." in data: item["torrent_404"] = True retcode = 0#8 else: retcode = 0 except urllib2.HTTPError, e: retcode = 8 self.on_subprocess_end(item, retcode) return else:
# NOTE(review): duplicate of the earlier collapsed process_one (differs only
# in "retcode = 0 #8" spacing).  Also truncated — trailing dangling "else:"
# with no visible body — and Python 2-only except syntax.  Left byte-identical
# rather than guessing at the missing branch; TODO recover the full original.
def process_one(self, item): with self.task_cwd(): url = item["current_url"] torrent_name, warc_name = item["file_bases"][item["current_id"]] item.log_output("Start downloading URL %s" % url) if item["current_is_torrent"]: extra_args = ["--output-document"] # F**k it, tired of wget try: data = urllib2.urlopen(url).read() if item["current_is_torrent"] and "Torrent not available." in data: item["torrent_404"] = True retcode = 0 #8 else: retcode = 0 except urllib2.HTTPError, e: retcode = 8 self.on_subprocess_end(item, retcode) return else:
def handle_process_error(self, exit_code, item):
    """Log a fetch failure for *item* and either retry it or fail it.

    While the max-tries and retry-on-exit-code policy allows, a retry is
    scheduled on the IOLoop after ``self.retry_delay`` seconds; otherwise
    the item's log writer is closed and the item is failed.
    """
    if item["current_is_torrent"] == True:
        # Torrent doesn't exist, so there's no point in trying to
        # download other pages.
        item.log_output(
            "Got throttled on ID %s (or server died)" % item["current_id"])
        retry_delay = self.retry_delay
        # fall through to retry
    else:
        retry_delay = self.retry_delay

    item["tries"] += 1
    item.log_output(
        "Process %s returned exit code %d for %s\n"
        % (self, exit_code, item.description()))
    item.log_error(self, exit_code)

    may_retry = self.max_tries == None or item["tries"] < self.max_tries
    retryable_code = (self.retry_on_exit_code == None
                      or exit_code in self.retry_on_exit_code)
    if may_retry and retryable_code:
        item.log_output(
            "Retrying %s for %s after %d seconds...\n"
            % (self, item.description(), retry_delay))
        IOLoop.instance().add_timeout(
            datetime.timedelta(seconds=retry_delay),
            functools.partial(self.process_one, item))
    else:
        item.log_output("Failed %s for %s\n" % (self, item.description()))
        item["logwriter"].close()
        self.fail_item(item)
def handle_process_result(self, exit_code, item):
    """Move to the next URL (or finish the item) after a clean fetch.

    When the preceding torrent fetch hit a 404, the whole ID is skipped.
    Otherwise the item flips between its torrent and metadata phases
    before the next URL is queued; when no URL remains the item is done.
    """
    if item["torrent_404"]:
        item.log_output(
            "404 for torrent file detected, skipping ID %s..."
            % item["current_id"])
        item["torrent_404"] = False
        if not self.set_next_url(item):
            self.handle_done(item)
        else:
            self.process_one(item)
        return

    if item["current_is_torrent"]:
        item.log_output(
            "Found torrent for ID %s, fetching metadata..."
            % item["current_id"])
    else:
        item.log_output(
            "Metadata for ID %s fetched. Moving on to next ID..."
            % item["current_id"])
    # Toggle the phase: torrent fetch <-> metadata fetch.
    item["current_is_torrent"] = not item["current_is_torrent"]
    if self.set_next_url(item):
        self.process_one(item)
    else:
        self.handle_done(item)
def handle_done(self, item):
    """Finish *item*: log completion, close its log writer, mark complete."""
    item.log_output("Finished %s for %s\n" % (self, item.description()))
    log_writer = item["logwriter"]
    log_writer.close()
    self.complete_item(item)