Example #1
0
    def download(self):
        """Download the file described by self.row with ranged worker threads.

        If a previous partial download exists (temp file plus .conf status
        file), resume it; otherwise probe the remote size and split the byte
        range across worker threads.  Progress and state changes are reported
        through emit() signals ('started', 'received', 'downloaded',
        'network-error', 'disk-error').
        """
        row = self.row
        if not os.path.exists(row[SAVEDIR_COL]):
            os.makedirs(row[SAVEDIR_COL], exist_ok=True)
        filepath, tmp_filepath, conf_filepath = get_tmp_filepath(
                row[SAVEDIR_COL], row[SAVENAME_COL])

        if os.path.exists(filepath):
            if self.download_mode == DownloadMode.IGNORE:
                self.emit('downloaded', row[FSID_COL])
                logger.debug('File exists, ignored!')
                return
            elif self.download_mode == DownloadMode.NEWCOPY:
                name, ext = os.path.splitext(filepath)
                filepath = '{0}_{1}{2}'.format(name, util.curr_time(), ext)

        url = pcs.get_download_link(self.cookie, self.tokens, row[PATH_COL])
        if not url:
            row[STATE_COL] = State.ERROR
            self.emit('network-error', row[FSID_COL])
            logger.warn('Failed to get url to download')
            return

        if os.path.exists(conf_filepath) and os.path.exists(tmp_filepath):
            # Resume: per-thread [start, end, received] triples were saved
            # in the conf file.
            with open(conf_filepath) as conf_fh:
                status = json.load(conf_fh)
            threads = len(status)
            file_exists = True
            fh = open(tmp_filepath, 'rb+')
            fh.seek(0)
        else:
            req = net.urlopen_simple(url)
            if not req:
                logger.warn('Failed to get url to download')
                self.emit('network-error', row[FSID_COL])
                return
            content_length = req.getheader('Content-Length')
            # Fixed: baiduPCS using non iso-8859-1 codec in http headers
            if not content_length:
                match = re.search(r'\sContent-Length:\s*(\d+)',
                                  str(req.headers))
                if not match:
                    logger.warn('Failed to get url to download')
                    self.emit('network-error', row[FSID_COL])
                    return
                content_length = match.group(1)
            size = int(content_length)
            # Small files are not worth splitting across threads.
            if size <= SMALL_FILE_SIZE:
                threads = 1
            else:
                threads = self.default_threads
            average_size, pad_size = divmod(size, threads)
            file_exists = False
            status = []
            fh = open(tmp_filepath, 'wb')
            try:
                # Pre-allocate the full file so each worker can write at
                # its own offset.
                fh.truncate(size)
            except (OSError, IOError):
                # BUG FIX: was `truncate.format_exc()` (NameError).
                logger.error(traceback.format_exc())
                self.emit('disk-error', row[FSID_COL], tmp_filepath)
                return

        # task list
        tasks = []
        # message queue used by workers to report progress
        queue = Queue()
        # lock serializing access to the shared file handle
        lock = threading.RLock()
        for id_ in range(threads):
            if file_exists:
                start_size, end_size, received = status[id_]
                if start_size + received >= end_size:
                    # this part of file has already been downloaded
                    continue
                start_size += received
            else:
                start_size = id_ * average_size
                end_size = start_size + average_size - 1
                if id_ == threads - 1:
                    # last thread absorbs the division remainder
                    end_size = end_size + pad_size + 1
                status.append([start_size, end_size, 0])
            task = DownloadBatch(id_, queue, url, lock, start_size, end_size,
                                 fh, self.timeout)
            tasks.append(task)

        for task in tasks:
            task.start()

        try:
            conf_count = 0
            done = 0
            self.emit('started', row[FSID_COL])
            while row[STATE_COL] == State.DOWNLOADING:
                id_, received = queue.get()
                # FINISHED
                if received == BATCH_FINISISHED:
                    done += 1
                    if done == len(tasks):
                        row[STATE_COL] = State.FINISHED
                        break
                    else:
                        continue
                # error occurs
                elif received == BATCH_ERROR:
                    row[STATE_COL] = State.ERROR
                    break
                status[id_][2] += received
                conf_count += 1
                # flush data and status to disk
                if conf_count > THRESHOLD_TO_FLUSH:
                    with lock:
                        if not fh.closed:
                            fh.flush()
                    # BUG FIX: was `as fh`, which shadowed the data-file
                    # handle and broke the later flush()/close() on it.
                    with open(conf_filepath, 'w') as conf_fh:
                        json.dump(status, conf_fh)
                    conf_count = 0
                received_total = sum(t[2] for t in status)
                self.emit('received', row[FSID_COL], received, received_total)
        except Exception:
            logger.error(traceback.format_exc())
            row[STATE_COL] = State.ERROR
        with lock:
            if not fh.closed:
                fh.close()
        for task in tasks:
            if task.isAlive():
                task.stop()
        with open(conf_filepath, 'w') as conf_fh:
            json.dump(status, conf_fh)

        if row[STATE_COL] == State.CANCELED:
            os.remove(tmp_filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
        elif row[STATE_COL] == State.ERROR:
            self.emit('network-error', row[FSID_COL])
        elif row[STATE_COL] == State.FINISHED:
            self.emit('downloaded', row[FSID_COL])
            os.rename(tmp_filepath, filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
Example #2
0
    def download(self):
        """Download the file described by self.row with ranged worker threads.

        Resumes from a saved .conf status file when one exists alongside the
        temporary file; otherwise probes the remote Content-Length and splits
        the byte range evenly across self.default_threads workers.  State is
        reported through emit() signals.
        """
        row = self.row
        if not os.path.exists(row[SAVEDIR_COL]):
            os.makedirs(row[SAVEDIR_COL], exist_ok=True)
        filepath, tmp_filepath, conf_filepath = get_tmp_filepath(
                row[SAVEDIR_COL], row[SAVENAME_COL])

        if os.path.exists(filepath):
            if self.download_mode == DownloadMode.IGNORE:
                self.emit('downloaded', row[FSID_COL])
                return
            elif self.download_mode == DownloadMode.NEWCOPY:
                name, ext = os.path.splitext(filepath)
                filepath = '{0}_{1}{2}'.format(name, util.curr_time(), ext)

        url = pcs.get_download_link(self.cookie, self.tokens, row[PATH_COL])
        if not url:
            print('Error: Failed to get download link')
            row[STATE_COL] = State.ERROR
            self.emit('network-error', row[FSID_COL])
            return

        if os.path.exists(conf_filepath) and os.path.exists(tmp_filepath):
            with open(conf_filepath) as conf_fh:
                status = json.load(conf_fh)
            threads = len(status)
            file_exists = True
            # BUG FIX: was 'ab' — append mode forces every write to the end
            # of the file regardless of seek(), corrupting resumed ranged
            # writes; 'rb+' allows positioned writes into the existing file.
            fh = open(tmp_filepath, 'rb+')
        else:
            req = request.urlopen(url)
            if not req:
                self.emit('network-error', row[FSID_COL])
                return
            content_length = req.getheader('Content-Length')
            # Fixed: baiduPCS using non iso-8859-1 codec in http headers
            if not content_length:
                match = re.search(r'\sContent-Length:\s*(\d+)',
                                  str(req.headers))
                if not match:
                    self.emit('network-error', row[FSID_COL])
                    return
                content_length = match.group(1)
            size = int(content_length)
            threads = self.default_threads
            average_size, pad_size = divmod(size, threads)
            file_exists = False
            status = []
            fh = open(tmp_filepath, 'wb')
            # Pre-allocate the full file so each worker writes at its own
            # offset.
            fh.truncate(size)

        # task list
        tasks = []
        # message queue used by workers to report progress
        queue = Queue()
        # lock serializing access to the shared file handle
        lock = threading.RLock()
        for id_ in range(threads):
            if file_exists:
                start_size, end_size, received = status[id_]
                start_size += received
            else:
                start_size = id_ * average_size
                end_size = start_size + average_size - 1
                if id_ == threads - 1:
                    # last thread absorbs the division remainder
                    end_size = end_size + pad_size + 1
                status.append([start_size, end_size, 0])
            task = DownloadBatch(id_, queue, url, lock, start_size, end_size,
                                 fh, self.timeout)
            tasks.append(task)

        for task in tasks:
            task.start()

        try:
            conf_count = 0
            done = 0
            self.emit('started', row[FSID_COL])
            while row[STATE_COL] == State.DOWNLOADING:
                id_, received = queue.get()
                # FINISHED
                if received == BATCH_FINISISHED:
                    done += 1
                    if done == len(status):
                        row[STATE_COL] = State.FINISHED
                        break
                    else:
                        continue
                elif received == BATCH_ERROR:
                    row[STATE_COL] = State.ERROR
                    break
                status[id_][2] += received
                conf_count += 1
                if conf_count > THRESHOLD_TO_FLUSH:
                    # BUG FIX: was `as fh`, which shadowed the data-file
                    # handle and broke the later fh.close().
                    with open(conf_filepath, 'w') as conf_fh:
                        conf_fh.write(json.dumps(status))
                    conf_count = 0
                received_total = sum(t[2] for t in status)
                self.emit('received', row[FSID_COL], received_total)
        except Exception as e:
            print(e)
            for task in tasks:
                task.stop()
            row[STATE_COL] = State.ERROR
        fh.close()
        with open(conf_filepath, 'w') as conf_fh:
            conf_fh.write(json.dumps(status))

        for task in tasks:
            # BUG FIX: condition was inverted (`if not task.isAlive()`);
            # only still-running tasks need to be stopped.
            if task.isAlive():
                task.stop()

        if row[STATE_COL] == State.CANCELED:
            # BUG FIX: was `tmp_filepah` (NameError on cancel).
            os.remove(tmp_filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
        elif row[STATE_COL] == State.FINISHED:
            self.emit('downloaded', row[FSID_COL])
            os.rename(tmp_filepath, filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
Example #3
0
    def download(self):
        """Download the file described by self.row with ranged worker threads.

        If a previous partial download exists (temp file plus .conf status
        file), resume it; otherwise probe the remote size and split the byte
        range across worker threads.  Zero-byte files are created directly.
        Progress and state changes are reported through emit() signals.
        """
        row = self.row
        if not os.path.exists(row[SAVEDIR_COL]):
            os.makedirs(row[SAVEDIR_COL], exist_ok=True)
        filepath, tmp_filepath, conf_filepath = get_tmp_filepath(
            row[SAVEDIR_COL], row[SAVENAME_COL])

        if os.path.exists(filepath):
            if self.download_mode == DownloadMode.IGNORE:
                self.emit('downloaded', row[FSID_COL])
                logger.debug('File exists, ignored!')
                return
            elif self.download_mode == DownloadMode.NEWCOPY:
                name, ext = os.path.splitext(filepath)
                filepath = '{0}_{1}{2}'.format(name, util.curr_time(), ext)

        url = pcs.get_download_link(self.cookie, self.tokens, row[PATH_COL])
        if not url:
            row[STATE_COL] = State.ERROR
            self.emit('network-error', row[FSID_COL])
            logger.warn('Failed to get url to download')
            return

        if os.path.exists(conf_filepath) and os.path.exists(tmp_filepath):
            # Resume: per-thread [start, end, received] triples were saved
            # in the conf file.
            with open(conf_filepath) as conf_fh:
                status = json.load(conf_fh)
            threads = len(status)
            file_exists = True
            fh = open(tmp_filepath, 'rb+')
            fh.seek(0)
        else:
            req = net.urlopen_simple(url)
            if not req:
                logger.warn('Failed to get url to download')
                self.emit('network-error', row[FSID_COL])
                return
            content_length = req.getheader('Content-Length')
            # Fixed: baiduPCS using non iso-8859-1 codec in http headers
            if not content_length:
                match = re.search(r'\sContent-Length:\s*(\d+)',
                                  str(req.headers))
                if not match:
                    logger.warn('Failed to get url to download')
                    self.emit('network-error', row[FSID_COL])
                    return
                content_length = match.group(1)
            size = int(content_length)
            if size == 0:
                # Empty remote file: just create it and finish.
                open(filepath, 'a').close()
                self.emit('downloaded', row[FSID_COL])
                return
            elif size <= SMALL_FILE_SIZE:
                # Small files are not worth splitting across threads.
                threads = 1
            else:
                threads = self.default_threads
            average_size, pad_size = divmod(size, threads)
            file_exists = False
            status = []
            fh = open(tmp_filepath, 'wb')
            try:
                # Pre-allocate the full file so each worker can write at
                # its own offset.
                fh.truncate(size)
            except (OSError, IOError):
                # BUG FIX: was `truncate.format_exc()` (NameError).
                logger.error(traceback.format_exc())
                self.emit('disk-error', row[FSID_COL], tmp_filepath)
                return

        # task list
        tasks = []
        # message queue used by workers to report progress
        queue = Queue()
        # lock serializing access to the shared file handle
        lock = threading.RLock()
        for id_ in range(threads):
            if file_exists:
                start_size, end_size, received = status[id_]
                if start_size + received >= end_size:
                    # this part of file has already been downloaded
                    continue
                start_size += received
            else:
                start_size = id_ * average_size
                end_size = start_size + average_size - 1
                if id_ == threads - 1:
                    # last thread absorbs the division remainder
                    end_size = end_size + pad_size + 1
                status.append([start_size, end_size, 0])
            task = DownloadBatch(id_, queue, url, lock, start_size, end_size,
                                 fh, self.timeout)
            tasks.append(task)

        for task in tasks:
            task.start()

        try:
            conf_count = 0
            done = 0
            self.emit('started', row[FSID_COL])
            while row[STATE_COL] == State.DOWNLOADING:
                id_, received = queue.get()
                # FINISHED
                if received == BATCH_FINISISHED:
                    done += 1
                    if done == len(tasks):
                        row[STATE_COL] = State.FINISHED
                        break
                    else:
                        continue
                # error occurs
                elif received == BATCH_ERROR:
                    row[STATE_COL] = State.ERROR
                    break
                status[id_][2] += received
                conf_count += 1
                # flush data and status to disk
                if conf_count > THRESHOLD_TO_FLUSH:
                    with lock:
                        if not fh.closed:
                            fh.flush()
                    # BUG FIX: was `as fh`, which shadowed the data-file
                    # handle and broke the later flush()/close() on it.
                    with open(conf_filepath, 'w') as conf_fh:
                        json.dump(status, conf_fh)
                    conf_count = 0
                received_total = sum(t[2] for t in status)
                self.emit('received', row[FSID_COL], received, received_total)
        except Exception:
            logger.error(traceback.format_exc())
            row[STATE_COL] = State.ERROR
        with lock:
            if not fh.closed:
                fh.close()
        for task in tasks:
            if task.isAlive():
                task.stop()
        with open(conf_filepath, 'w') as conf_fh:
            json.dump(status, conf_fh)

        if row[STATE_COL] == State.CANCELED:
            if os.path.exists(tmp_filepath):
                os.remove(tmp_filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
        elif row[STATE_COL] == State.ERROR:
            self.emit('network-error', row[FSID_COL])
        elif row[STATE_COL] == State.FINISHED:
            self.emit('downloaded', row[FSID_COL])
            os.rename(tmp_filepath, filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
Example #4
0
    def download(self):
        """Download the file described by self.row with ranged worker threads.

        Resumes from a saved .conf status file when one exists alongside the
        temporary file; otherwise probes the remote Content-Length and splits
        the byte range evenly across self.default_threads workers.  State is
        reported through emit() signals.
        """
        row = self.row
        if not os.path.exists(row[SAVEDIR_COL]):
            os.makedirs(row[SAVEDIR_COL], exist_ok=True)
        filepath, tmp_filepath, conf_filepath = get_tmp_filepath(
            row[SAVEDIR_COL], row[SAVENAME_COL])

        if os.path.exists(filepath):
            if self.download_mode == DownloadMode.IGNORE:
                self.emit('downloaded', row[FSID_COL])
                return
            elif self.download_mode == DownloadMode.NEWCOPY:
                name, ext = os.path.splitext(filepath)
                filepath = '{0}_{1}{2}'.format(name, util.curr_time(), ext)

        url = pcs.get_download_link(self.cookie, self.tokens, row[PATH_COL])
        if not url:
            print('Error: Failed to get download link')
            row[STATE_COL] = State.ERROR
            self.emit('network-error', row[FSID_COL])
            return

        if os.path.exists(conf_filepath) and os.path.exists(tmp_filepath):
            with open(conf_filepath) as conf_fh:
                status = json.load(conf_fh)
            threads = len(status)
            file_exists = True
            # BUG FIX: was 'ab' — append mode forces every write to the end
            # of the file regardless of seek(), corrupting resumed ranged
            # writes; 'rb+' allows positioned writes into the existing file.
            fh = open(tmp_filepath, 'rb+')
        else:
            req = request.urlopen(url)
            if not req:
                self.emit('network-error', row[FSID_COL])
                return
            content_length = req.getheader('Content-Length')
            # Fixed: baiduPCS using non iso-8859-1 codec in http headers
            if not content_length:
                match = re.search(r'\sContent-Length:\s*(\d+)',
                                  str(req.headers))
                if not match:
                    self.emit('network-error', row[FSID_COL])
                    return
                content_length = match.group(1)
            size = int(content_length)
            threads = self.default_threads
            average_size, pad_size = divmod(size, threads)
            file_exists = False
            status = []
            fh = open(tmp_filepath, 'wb')
            # Pre-allocate the full file so each worker writes at its own
            # offset.
            fh.truncate(size)

        # task list
        tasks = []
        # message queue used by workers to report progress
        queue = Queue()
        # lock serializing access to the shared file handle
        lock = threading.RLock()
        for id_ in range(threads):
            if file_exists:
                start_size, end_size, received = status[id_]
                start_size += received
            else:
                start_size = id_ * average_size
                end_size = start_size + average_size - 1
                if id_ == threads - 1:
                    # last thread absorbs the division remainder
                    end_size = end_size + pad_size + 1
                status.append([start_size, end_size, 0])
            task = DownloadBatch(id_, queue, url, lock, start_size, end_size,
                                 fh, self.timeout)
            tasks.append(task)

        for task in tasks:
            task.start()

        try:
            conf_count = 0
            done = 0
            self.emit('started', row[FSID_COL])
            while row[STATE_COL] == State.DOWNLOADING:
                id_, received = queue.get()
                # FINISHED
                if received == BATCH_FINISISHED:
                    done += 1
                    if done == len(status):
                        row[STATE_COL] = State.FINISHED
                        break
                    else:
                        continue
                elif received == BATCH_ERROR:
                    row[STATE_COL] = State.ERROR
                    break
                status[id_][2] += received
                conf_count += 1
                if conf_count > THRESHOLD_TO_FLUSH:
                    # BUG FIX: was `as fh`, which shadowed the data-file
                    # handle and broke the later fh.close().
                    with open(conf_filepath, 'w') as conf_fh:
                        conf_fh.write(json.dumps(status))
                    conf_count = 0
                received_total = sum(t[2] for t in status)
                self.emit('received', row[FSID_COL], received_total)
        except Exception as e:
            print(e)
            for task in tasks:
                task.stop()
            row[STATE_COL] = State.ERROR
        fh.close()
        with open(conf_filepath, 'w') as conf_fh:
            conf_fh.write(json.dumps(status))

        for task in tasks:
            # BUG FIX: condition was inverted (`if not task.isAlive()`);
            # only still-running tasks need to be stopped.
            if task.isAlive():
                task.stop()

        if row[STATE_COL] == State.CANCELED:
            # BUG FIX: was `tmp_filepah` (NameError on cancel).
            os.remove(tmp_filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)
        elif row[STATE_COL] == State.FINISHED:
            self.emit('downloaded', row[FSID_COL])
            os.rename(tmp_filepath, filepath)
            if os.path.exists(conf_filepath):
                os.remove(conf_filepath)