Example #1
0
	def process_body(self, body, item):
		"""Start the upload requested by the tracker's JSON response.

		`body` is parsed as JSON. When it carries an "upload_target" key, the
		target's scheme selects the uploader: rsync:// targets use RsyncUpload
		and http(s):// targets use CurlUpload, which is only given a single
		file. Any other target fails the item. When the key is absent the
		item is scheduled for a retry.
		"""
		response = json.loads(body)

		if "upload_target" not in response:
			item.log_output("Tracker did not provide an upload target.")
			self.schedule_retry(item)
			return

		upload_files = realize(self.files, item)
		target = response["upload_target"]

		if re.match(r"^rsync://", target):
			item.log_output("Uploading with Rsync to %s" % target)
			uploader = RsyncUpload(
				target,
				upload_files,
				target_source_path=self.rsync_target_source_path,
				bwlimit=self.rsync_bwlimit,
				extra_args=self.rsync_extra_args,
				max_tries=1
			)
		elif re.match(r"^https?://", target):
			item.log_output("Uploading with Curl to %s" % target)

			# The curl uploader is handed exactly one file, so anything else
			# is treated as a failure.
			if len(upload_files) != 1:
				item.log_output("Curl expects to upload a single file.")
				self.fail_item(item)
				return

			uploader = CurlUpload(
				target,
				upload_files[0],
				self.curl_connect_timeout,
				self.curl_speed_limit,
				self.curl_speed_time,
				max_tries=1
			)
		else:
			item.log_output("Received invalid upload type.")
			self.fail_item(item)
			return

		# Forward the inner task's outcome to this task's own handlers
		# before queueing the item on it.
		uploader.on_complete_item += self._inner_task_complete_item
		uploader.on_fail_item += self._inner_task_fail_item
		uploader.enqueue(item)
Example #2
0
    def handle_process_error(self, exit_code, item):
        """Record a failed attempt for `item`, then retry it or give up.

        The attempt counter in item["tries"] is incremented and the exit code
        is logged. The item is retried after self.retry_delay seconds when
        attempts remain (self.max_tries is None or not yet reached) and the
        exit code is retryable (self.retry_on_exit_code is None or contains
        exit_code). Otherwise the item's log writer is closed and the item
        is failed.
        """
        if item["current_is_torrent"]:
            # A failed torrent request is reported as throttling (or a server
            # outage); it still goes through the normal retry path below.
            item.log_output("Got throttled on ID %s (or server died)" %
                            item["current_id"])

        # Torrent and metadata requests wait the same delay before a retry.
        retry_delay = self.retry_delay

        item["tries"] += 1

        item.log_output("Process %s returned exit code %d for %s\n" %
                        (self, exit_code, item.description()))
        item.log_error(self, exit_code)

        if (self.max_tries is None or item["tries"] < self.max_tries) and (
                self.retry_on_exit_code is None
                or exit_code in self.retry_on_exit_code):
            item.log_output("Retrying %s for %s after %d seconds...\n" %
                            (self, item.description(), retry_delay))
            # Re-run process_one for this item once the delay has elapsed.
            IOLoop.instance().add_timeout(
                datetime.timedelta(seconds=retry_delay),
                functools.partial(self.process_one, item))
        else:
            item.log_output("Failed %s for %s\n" % (self, item.description()))
            item["logwriter"].close()
            self.fail_item(item)
Example #3
0
	def process(self, item):
		"""Move every ID's torrent and WARC file out of the item directory.

		For each ID from item["start_id"] through item["end_id"] (inclusive),
		both files named in item["file_bases"] are renamed from
		item["item_dir"] into item["data_dir"]. The item directory is removed
		afterwards.
		"""
		first_id, last_id = item["start_id"], item["end_id"]

		for current_id in xrange(first_id, last_id + 1):
			torrent_base, warc_base = item["file_bases"][current_id]

			# Torrent first, then WARC; both get the same log-and-rename step.
			for base_name in (torrent_base, warc_base):
				source = "%s/%s" % (item["item_dir"], base_name)
				destination = "%s/%s" % (item["data_dir"], base_name)
				item.log_output("Moving file from %s to %s" % (source, destination))
				os.rename(source, destination)

		shutil.rmtree("%(item_dir)s" % item)
Example #4
0
    def process(self, item):
        """Move every ID's torrent and WARC file out of the item directory.

        For each ID from item["start_id"] through item["end_id"] (inclusive),
        both files named in item["file_bases"] are renamed from
        item["item_dir"] into item["data_dir"]. The item directory is removed
        afterwards.
        """
        first_id, last_id = item["start_id"], item["end_id"]

        for current_id in xrange(first_id, last_id + 1):
            torrent_base, warc_base = item["file_bases"][current_id]

            # Torrent first, then WARC; both get the same log-and-rename step.
            for base_name in (torrent_base, warc_base):
                source = "%s/%s" % (item["item_dir"], base_name)
                destination = "%s/%s" % (item["data_dir"], base_name)
                item.log_output("Moving file from %s to %s" %
                                (source, destination))
                os.rename(source, destination)

        shutil.rmtree("%(item_dir)s" % item)
Example #5
0
    def process_body(self, body, item):
        """Start the upload requested by the tracker's JSON response.

        `body` is parsed as JSON. When it carries an "upload_target" key, the
        target's scheme selects the uploader: rsync:// targets use RsyncUpload
        and http(s):// targets use CurlUpload, which is only given a single
        file. Any other target fails the item. When the key is absent the
        item is scheduled for a retry.
        """
        reply = json.loads(body)

        if "upload_target" not in reply:
            item.log_output("Tracker did not provide an upload target.")
            self.schedule_retry(item)
            return

        paths = realize(self.files, item)
        destination = reply["upload_target"]

        if re.match(r"^rsync://", destination):
            item.log_output("Uploading with Rsync to %s" % destination)
            upload_task = RsyncUpload(
                destination,
                paths,
                target_source_path=self.rsync_target_source_path,
                bwlimit=self.rsync_bwlimit,
                extra_args=self.rsync_extra_args,
                max_tries=1)
        elif re.match(r"^https?://", destination):
            item.log_output("Uploading with Curl to %s" % destination)

            # The curl uploader is handed exactly one file, so anything else
            # is treated as a failure.
            if len(paths) != 1:
                item.log_output("Curl expects to upload a single file.")
                self.fail_item(item)
                return

            upload_task = CurlUpload(
                destination,
                paths[0],
                self.curl_connect_timeout,
                self.curl_speed_limit,
                self.curl_speed_time,
                max_tries=1)
        else:
            item.log_output("Received invalid upload type.")
            self.fail_item(item)
            return

        # Forward the inner task's outcome to this task's own handlers
        # before queueing the item on it.
        upload_task.on_complete_item += self._inner_task_complete_item
        upload_task.on_fail_item += self._inner_task_fail_item
        upload_task.enqueue(item)
Example #6
0
	def process(self, item):
		"""Prepare a fresh working directory and placeholder files for an item.

		item["item_name"] must look like "<start>-<end>". The parsed bounds are
		stored as item["start_id"] and item["end_id"]. A directory named after
		the item is (re)created under item["data_dir"] and recorded as
		item["item_dir"]. For every ID in the inclusive range an empty
		.torrent and .warc.gz file is created, and their base names are
		recorded in item["file_bases"][id] as (torrent_base, warc_base).
		"""
		item_name = item["item_name"]

		assert "-" in item_name
		low_text, high_text = item_name.split("-", 1)
		first_id, last_id = int(low_text), int(high_text)

		item["start_id"] = first_id
		item["end_id"] = last_id
		item["file_bases"] = {}

		work_dir = "/".join((item["data_dir"], item_name))

		# Discard anything left in a directory of the same name.
		if os.path.isdir(work_dir):
			shutil.rmtree(work_dir)

		item.log_output("Creating directory %s" % work_dir)
		os.makedirs(work_dir)

		item["item_dir"] = work_dir

		for current_id in xrange(first_id, last_id + 1):
			file_base = "%s-%s-%s" % (self.file_prefix, time.strftime("%Y%m%d-%H%M%S"), current_id)
			base_names = ("%s.torrent" % file_base, "%s.warc.gz" % file_base)
			item["file_bases"][current_id] = base_names

			for base_name in base_names:
				path = "%s/%s" % (work_dir, base_name)
				item.log_output("Creating file %s" % path)
				# Opening for write and closing immediately leaves an empty file.
				open(path, "w").close()
Example #7
0
 def handle_process_result(self, exit_code, item):
     """Advance the item after a successful fetch.

     When item["torrent_404"] is set, the flag is cleared and the ID is
     skipped. Otherwise item["current_is_torrent"] is toggled, so a torrent
     fetch is followed by a metadata fetch and vice versa. In both cases
     the next URL is then requested; when there is none, the item is
     finished through handle_done.
     """
     if item["torrent_404"]:
         item.log_output(
             "404 for torrent file detected, skipping ID %s..." %
             item["current_id"])
         item["torrent_404"] = False
     else:
         was_torrent = item["current_is_torrent"]
         if was_torrent:
             template = "Found torrent for ID %s, fetching metadata..."
         else:
             template = "Metadata for ID %s fetched. Moving on to next ID..."
         item.log_output(template % item["current_id"])
         item["current_is_torrent"] = not was_torrent

     # Both paths continue the same way: fetch the next URL or finish.
     has_next = self.set_next_url(item)
     if has_next:
         self.process_one(item)
     else:
         self.handle_done(item)
Example #8
0
    def process(self, item):
        """Prepare a fresh working directory and placeholder files for an item.

        item["item_name"] must look like "<start>-<end>". The parsed bounds
        are stored as item["start_id"] and item["end_id"]. A directory named
        after the item is (re)created under item["data_dir"] and recorded as
        item["item_dir"]. For every ID in the inclusive range an empty
        .torrent and .warc.gz file is created, and their base names are
        recorded in item["file_bases"][id] as (torrent_base, warc_base).
        """
        item_name = item["item_name"]

        assert "-" in item_name
        low_text, high_text = item_name.split("-", 1)
        first_id, last_id = int(low_text), int(high_text)

        item["start_id"] = first_id
        item["end_id"] = last_id
        item["file_bases"] = {}

        work_dir = "/".join((item["data_dir"], item_name))

        # Discard anything left in a directory of the same name.
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)

        item.log_output("Creating directory %s" % work_dir)
        os.makedirs(work_dir)

        item["item_dir"] = work_dir

        for current_id in xrange(first_id, last_id + 1):
            file_base = "%s-%s-%s" % (
                self.file_prefix,
                time.strftime("%Y%m%d-%H%M%S"),
                current_id,
            )
            base_names = ("%s.torrent" % file_base, "%s.warc.gz" % file_base)
            item["file_bases"][current_id] = base_names

            for base_name in base_names:
                path = "%s/%s" % (work_dir, base_name)
                item.log_output("Creating file %s" % path)
                # Opening for write and closing immediately leaves an empty
                # file.
                open(path, "w").close()
Example #9
0
	def process_one(self, item):
		# Fetch the item's current URL from inside the task's working directory.
		# NOTE(review): this excerpt ends at the dangling `else:` below, so only
		# the torrent branch is visible here.
		with self.task_cwd():
			url = item["current_url"]
			# Base names registered for the current ID: (torrent, warc).
			torrent_name, warc_name = item["file_bases"][item["current_id"]]
			
			item.log_output("Start downloading URL %s" % url)
			
			if item["current_is_torrent"]:
				# NOTE(review): extra_args is not used in the visible torrent
				# branch - confirm whether the code past this excerpt needs it.
				extra_args = ["--output-document"]
				# The torrent URL is fetched directly with urllib2 instead of
				# going through wget.
				try:
					data = urllib2.urlopen(url).read()
					# A missing torrent is signalled in the page body; flag it on
					# the item but still report exit code 0.
					if item["current_is_torrent"] and "Torrent not available." in data:
						item["torrent_404"] = True
						retcode = 0#8
					else:
						retcode = 0
				except urllib2.HTTPError, e:
					# HTTP errors are reported as exit code 8 (presumably
					# mirroring wget's server-error exit code - confirm).
					retcode = 8
				# Report the outcome through the subprocess-end handler and stop.
				self.on_subprocess_end(item, retcode)
				return
			else:
Example #10
0
    def process_one(self, item):
        # Fetch the item's current URL from inside the task's working directory.
        # NOTE(review): this excerpt ends at the dangling `else:` below, so only
        # the torrent branch is visible here.
        with self.task_cwd():
            url = item["current_url"]
            # Base names registered for the current ID: (torrent, warc).
            torrent_name, warc_name = item["file_bases"][item["current_id"]]

            item.log_output("Start downloading URL %s" % url)

            if item["current_is_torrent"]:
                # NOTE(review): extra_args is not used in the visible torrent
                # branch - confirm whether the code past this excerpt needs it.
                extra_args = ["--output-document"]
                # The torrent URL is fetched directly with urllib2 instead of
                # going through wget.
                try:
                    data = urllib2.urlopen(url).read()
                    # A missing torrent is signalled in the page body; flag it
                    # on the item but still report exit code 0.
                    if item["current_is_torrent"] and "Torrent not available." in data:
                        item["torrent_404"] = True
                        retcode = 0  #8
                    else:
                        retcode = 0
                except urllib2.HTTPError, e:
                    # HTTP errors are reported as exit code 8 (presumably
                    # mirroring wget's server-error exit code - confirm).
                    retcode = 8
                # Report the outcome through the subprocess-end handler and stop.
                self.on_subprocess_end(item, retcode)
                return
            else:
Example #11
0
	def handle_process_error(self, exit_code, item):
		"""Record a failed attempt for `item`, then retry it or give up.

		The attempt counter in item["tries"] is incremented and the exit code
		is logged. The item is retried after self.retry_delay seconds when
		attempts remain (self.max_tries is None or not yet reached) and the
		exit code is retryable (self.retry_on_exit_code is None or contains
		exit_code). Otherwise the item's log writer is closed and the item
		is failed.
		"""
		if item["current_is_torrent"]:
			# A failed torrent request is reported as throttling (or a server
			# outage); it still goes through the normal retry path below.
			item.log_output("Got throttled on ID %s (or server died)" % item["current_id"])

		# Torrent and metadata requests wait the same delay before a retry.
		retry_delay = self.retry_delay

		item["tries"] += 1

		item.log_output("Process %s returned exit code %d for %s\n" % (self, exit_code, item.description()))
		item.log_error(self, exit_code)

		if (self.max_tries is None or item["tries"] < self.max_tries) and (self.retry_on_exit_code is None or exit_code in self.retry_on_exit_code):
			item.log_output("Retrying %s for %s after %d seconds...\n" % (self, item.description(), retry_delay))
			# Re-run process_one for this item once the delay has elapsed.
			IOLoop.instance().add_timeout(datetime.timedelta(seconds=retry_delay),
				functools.partial(self.process_one, item))
		else:
			item.log_output("Failed %s for %s\n" % (self, item.description()))
			item["logwriter"].close()
			self.fail_item(item)
Example #12
0
	def handle_process_result(self, exit_code, item):
		"""Advance the item after a successful fetch.

		When item["torrent_404"] is set, the flag is cleared and the ID is
		skipped. Otherwise item["current_is_torrent"] is toggled, so a torrent
		fetch is followed by a metadata fetch and vice versa. In both cases
		the next URL is then requested; when there is none, the item is
		finished through handle_done.
		"""
		if item["torrent_404"]:
			item.log_output("404 for torrent file detected, skipping ID %s..." % item["current_id"])
			item["torrent_404"] = False
		else:
			was_torrent = item["current_is_torrent"]
			if was_torrent:
				template = "Found torrent for ID %s, fetching metadata..."
			else:
				template = "Metadata for ID %s fetched. Moving on to next ID..."
			item.log_output(template % item["current_id"])
			item["current_is_torrent"] = not was_torrent

		# Both paths continue the same way: fetch the next URL or finish.
		has_next = self.set_next_url(item)
		if has_next:
			self.process_one(item)
		else:
			self.handle_done(item)
Example #13
0
	def handle_done(self, item):
		"""Log that the item is finished, close its log writer and complete it."""
		summary = "Finished %s for %s\n" % (self, item.description())
		item.log_output(summary)

		log_writer = item["logwriter"]
		log_writer.close()

		self.complete_item(item)
Example #14
0
 def handle_done(self, item):
     """Log that the item is finished, close its log writer and complete it."""
     summary = "Finished %s for %s\n" % (self, item.description())
     item.log_output(summary)

     log_writer = item["logwriter"]
     log_writer.close()

     self.complete_item(item)