def download(self):
    row = self.row
    if not os.path.exists(row[SAVEDIR_COL]):
        os.makedirs(row[SAVEDIR_COL], exist_ok=True)
    filepath, tmp_filepath, conf_filepath = get_tmp_filepath(
            row[SAVEDIR_COL], row[SAVENAME_COL])

    if os.path.exists(filepath):
        if self.download_mode == DownloadMode.IGNORE:
            self.emit('downloaded', row[FSID_COL])
            logger.debug('File exists, ignored!')
            return
        elif self.download_mode == DownloadMode.NEWCOPY:
            name, ext = os.path.splitext(filepath)
            filepath = '{0}_{1}{2}'.format(name, util.curr_time(), ext)

    url = pcs.get_download_link(self.cookie, self.tokens, row[PATH_COL])
    if not url:
        row[STATE_COL] = State.ERROR
        self.emit('network-error', row[FSID_COL])
        logger.warn('Failed to get url to download')
        return

    if os.path.exists(conf_filepath) and os.path.exists(tmp_filepath):
        # Resume a partial download: restore per-thread status from the
        # conf file and reopen the temp file for in-place writes.
        with open(conf_filepath) as conf_fh:
            status = json.load(conf_fh)
        threads = len(status)
        file_exists = True
        fh = open(tmp_filepath, 'rb+')
        fh.seek(0)
    else:
        req = net.urlopen_simple(url)
        if not req:
            logger.warn('Failed to get url to download')
            self.emit('network-error', row[FSID_COL])
            return
        content_length = req.getheader('Content-Length')
        # Fixed: baiduPCS using non iso-8859-1 codec in http headers
        if not content_length:
            match = re.search(r'\sContent-Length:\s*(\d+)', str(req.headers))
            if not match:
                logger.warn('Failed to get url to download')
                self.emit('network-error', row[FSID_COL])
                return
            content_length = match.group(1)
        size = int(content_length)
        if size == 0:
            open(filepath, 'a').close()
            self.emit('downloaded', row[FSID_COL])
            return
        elif size <= SMALL_FILE_SIZE:
            threads = 1
        else:
            threads = self.default_threads
        average_size, pad_size = divmod(size, threads)
        file_exists = False
        status = []
        fh = open(tmp_filepath, 'wb')
        try:
            fh.truncate(size)
        except (OSError, IOError):
            logger.error(traceback.format_exc())
            self.emit('disk-error', row[FSID_COL], tmp_filepath)
            return

    # task list
    tasks = []
    # message queue
    queue = Queue()
    # threads lock
    lock = threading.RLock()
    for id_ in range(threads):
        if file_exists:
            start_size, end_size, received = status[id_]
            if start_size + received >= end_size:
                # this part of the file has already been downloaded
                continue
            start_size += received
        else:
            start_size = id_ * average_size
            end_size = start_size + average_size - 1
            if id_ == threads - 1:
                end_size = end_size + pad_size + 1
            status.append([start_size, end_size, 0])
        task = DownloadBatch(id_, queue, url, lock, start_size, end_size,
                             fh, self.timeout)
        tasks.append(task)

    for task in tasks:
        task.start()

    try:
        conf_count = 0
        done = 0
        self.emit('started', row[FSID_COL])
        while row[STATE_COL] == State.DOWNLOADING:
            id_, received = queue.get()
            # FINISHED
            if received == BATCH_FINISISHED:
                done += 1
                if done == len(tasks):
                    row[STATE_COL] = State.FINISHED
                    break
                else:
                    continue
            # error occurs
            elif received == BATCH_ERROR:
                row[STATE_COL] = State.ERROR
                break
            status[id_][2] += received
            conf_count += 1
            # flush data and status to disk
            if conf_count > THRESHOLD_TO_FLUSH:
                with lock:
                    if not fh.closed:
                        fh.flush()
                with open(conf_filepath, 'w') as conf_fh:
                    json.dump(status, conf_fh)
                conf_count = 0
            received_total = sum(t[2] for t in status)
            self.emit('received', row[FSID_COL], received, received_total)
    except Exception:
        logger.error(traceback.format_exc())
        row[STATE_COL] = State.ERROR

    with lock:
        if not fh.closed:
            fh.close()
    for task in tasks:
        if task.is_alive():
            task.stop()
    with open(conf_filepath, 'w') as conf_fh:
        json.dump(status, conf_fh)

    if row[STATE_COL] == State.CANCELED:
        if os.path.exists(tmp_filepath):
            os.remove(tmp_filepath)
        if os.path.exists(conf_filepath):
            os.remove(conf_filepath)
    elif row[STATE_COL] == State.ERROR:
        self.emit('network-error', row[FSID_COL])
    elif row[STATE_COL] == State.FINISHED:
        self.emit('downloaded', row[FSID_COL])
        os.rename(tmp_filepath, filepath)
        if os.path.exists(conf_filepath):
            os.remove(conf_filepath)
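

# The main loop in download() assumes each DownloadBatch worker reports
# progress through the shared queue as (id_, received) tuples, where
# `received` is a byte count, and signals the end of its byte range with the
# BATCH_FINISISHED sentinel (or BATCH_ERROR on failure).  DownloadBatch
# itself is not part of this section; the function below is only a minimal
# sketch of that assumed protocol, built on urllib rather than the project's
# own net helpers, and is not the actual implementation (the real worker is a
# thread object with start()/stop()/is_alive()).
def _download_batch_sketch(id_, queue, url, lock, start_size, end_size, fh,
                           timeout, chunk_size=64 * 1024):
    """Hypothetical worker: fetch bytes [start_size, end_size] of `url` and
    report progress on `queue` the way download() expects."""
    import urllib.request

    req = urllib.request.Request(url)
    req.add_header('Range', 'bytes={0}-{1}'.format(start_size, end_size))
    offset = start_size
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                # Write into the shared temp file under the shared lock,
                # at this worker's current offset.
                with lock:
                    fh.seek(offset)
                    fh.write(chunk)
                offset += len(chunk)
                queue.put((id_, len(chunk)))   # progress: bytes received
    except Exception:
        queue.put((id_, BATCH_ERROR))          # error sentinel
        return
    queue.put((id_, BATCH_FINISISHED))         # finished sentinel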