def _copy_chunks(self): init = self.info.get_chunk_name(0) # initial chunk name if self.info.get_count() > 1: with io.open(init, 'rb+') as fpo: # first chunkfile for i in range(1, self.info.get_count()): # input file # seek to beginning of chunk, # to get rid of overlapping chunks fpo.seek(self.info.get_chunk_range(i - 1)[1] + 1) filename = '{0}.chunk{1:d}'.format(self.path, i) buf = 32 << 10 with io.open(filename, mode='rb') as fpi: while True: # copy in chunks, consumes less memory data = fpi.read(buf) if not data: break fpo.write(data) if fpo.tell() < self.info.get_chunk_range(i)[1]: remove(init) self.info.remove() # there are probably invalid chunks raise Exception( 'Downloaded content was smaller than expected') remove(filename) # remove chunk if self.name: filepath = os.path.join(os.path.dirname(self.path), self.name) self.set_path(filepath) shutil.move(init, self.path)
def setUp(self):
    """Prepare a clean test environment: delete leftover downloaded
    files and any report files from a previous run.
    """
    PluginTester.setUp(self)
    for f in self.files:
        # compute the target path once instead of calling save_join twice
        path = save_join(DL_DIR, f)
        if exists(path):
            remove(path)

    # folder for reports
    report = join("tmp", self.__class__.__name__)
    if exists(report):
        for f in listdir(report):
            remove(join(report, f))
def setUp(self):
    """Reset the test environment: drop stale downloads and purge any
    report files left over from an earlier run.
    """
    PluginTester.setUp(self)

    # downloaded files from previous runs (best effort, trashed)
    for name in self.files:
        remove(os.path.join(DL_DIR, name), trash=True, ignore_errors=True)

    # folder for reports
    report_dir = os.path.join(self.__class__.__name__)
    if not os.path.exists(report_dir):
        return None
    for name in os.listdir(report_dir):
        remove(os.path.join(report_dir, name), trash=True)
def _decrypt(self, urls):
    """Internal method to select decrypting method

    :param urls: List of urls/content
    :return: list of decrypted results
    """
    cls = self.__class__
    # separate local and remote files
    content, urls = self.getLocalContent(urls)
    result = []

    if urls and has_method(cls, "decrypt"):
        self.logDebug("Deprecated .decrypt() method in Crypter plugin")
        for url in urls:
            self.pyfile = PyFileMockup(url)
            self.setup()
            self.decrypt(self.pyfile)
            result.extend(self.convertPackages())
    elif urls:
        # prefer batch decryption; fall back to per-url decryption when
        # decryptURLs is not implemented (EAFP instead of a flag variable)
        try:
            self.setup()
            result = to_list(self.decryptURLs(urls))
        except NotImplementedError:
            # this will raise error if decryptURL is not implemented either
            result = []
            for url in urls:
                self.setup()
                result.extend(to_list(self.decryptURL(url)))

    for f, c in content:
        self.setup()
        result.extend(to_list(self.decryptFile(c)))
        try:
            # only remove temporary files we created ourselves
            if f.startswith("tmp_"):
                remove(f)
        except IOError:
            self.logWarning(_("Could not remove file '%s'") % f)
            self.core.print_exc()

    return result
def _decrypt(self, urls):
    """Internal method to select decrypting method

    :param urls: List of urls/content
    :return: list of decrypted results
    """
    cls = self.__class__
    # separate local and remote files
    content, urls = self.getLocalContent(urls)

    # dispatch by which hook the plugin overrides, most specific first
    if has_method(cls, "decryptURLs"):
        self.setup()
        result = to_list(self.decryptURLs(urls))
    elif has_method(cls, "decryptURL"):
        result = []
        for url in urls:
            self.setup()
            result.extend(to_list(self.decryptURL(url)))
    elif has_method(cls, "decrypt"):
        self.logDebug("Deprecated .decrypt() method in Crypter plugin")
        result = []
        for url in urls:
            self.pyfile = PyFileMockup(url)
            self.setup()
            self.decrypt(self.pyfile)
            result.extend(self.convertPackages())
    else:
        # only warn when there were remote urls that nothing can handle
        if not has_method(cls, "decryptFile") or urls:
            self.logDebug("No suited decrypting method was overwritten in plugin")
        result = []

    # local files are always handled through decryptFile if available
    if has_method(cls, "decryptFile"):
        for f, c in content:
            self.setup()
            result.extend(to_list(self.decryptFile(c)))
            try:
                # only clean up temporary files we created ourselves
                if f.startswith("tmp_"):
                    remove(f)
            except IOError:
                self.logWarning(_("Could not remove file '%s'") % f)
                self.core.print_exc()

    return result
def checkDownload(self, rules, api_size=0, max_size=50000, delete=True, read_size=0):
    """ checks the content of the last downloaded file, re match is saved to `lastCheck`

    :param rules: dict with names and rules to match (compiled regexp or strings)
    :param api_size: expected file size
    :param max_size: if the file is larger then it wont be checked
    :param delete: delete if matched
    :param read_size: amount of bytes to read from files larger then max_size
    :return: dictionary key of the first rule that matched
    """
    lastDownload = fs_encode(self.lastDownload)
    if not exists(lastDownload):
        return None

    size = stat(lastDownload).st_size
    if api_size and api_size <= size:
        # file is at least as large as the api promised -> assume valid
        return None
    elif size > max_size and not read_size:
        return None
    self.log.debug("Download Check triggered")

    # context manager so the handle is closed even if read() raises
    with open(lastDownload, "rb") as f:
        content = f.read(read_size if read_size else -1)

    # produces encoding errors, better log to other file in the future?
    # self.log.debug("Content: %s" % content)
    for name, rule in rules.iteritems():
        # isinstance instead of exact type comparison, also matches subclasses
        if isinstance(rule, (str, unicode)):
            if rule in content:
                if delete:
                    remove(lastDownload)
                return name
        elif hasattr(rule, "search"):
            m = rule.search(content)
            if m:
                if delete:
                    remove(lastDownload)
                self.lastCheck = m
                return name
def check_download(self, rules, api_size=0, max_size=50000, delete=True, read_size=0):
    """Checks the content of the last downloaded file, re match is saved to `last_check`

    :param rules: dict with names and rules to match (compiled regexp or strings)
    :param api_size: expected file size
    :param max_size: if the file is larger then it wont be checked
    :param delete: delete if matched
    :param read_size: amount of bytes to read from files larger then max_size
    :return: dictionary key of the first rule that matched
    """
    if not os.path.isfile(self.last_download):
        return

    size = os.stat(self.last_download).st_size
    if api_size and api_size <= size:
        # file is at least as large as the api promised -> assume valid
        return
    elif size > max_size and not read_size:
        return
    self.pyload.log.debug('Download Check triggered')

    with io.open(self.last_download, mode='rb') as fp:
        content = fp.read(read_size if read_size else -1)

    # produces encoding errors, better log to other file in the future?
    # self.pyload.log.debug("Content: {0}".format(content))
    for name, rule in rules.items():
        if isinstance(rule, str):
            # NOTE(review): `content` is bytes (file opened 'rb') while a
            # str `rule` would make `rule in content` raise TypeError on
            # Python 3 — confirm callers pass bytes rules or decode first
            if rule in content:
                if delete:
                    remove(self.last_download, trash=True)
                return name
        elif hasattr(rule, 'search'):
            # NOTE(review): compiled str patterns likewise cannot search
            # bytes content — presumably patterns are bytes; verify
            m = rule.search(content)
            if m is not None:
                if delete:
                    remove(self.last_download, trash=True)
                self.last_check = m
                return name
def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True):
    """Run the tesseract OCR binary over the current captcha image and
    store the recognized text in ``self.result_captcha``.

    :param subset: restrict recognition to a character whitelist
    :param digits: include digits in the whitelist
    :param lowercase: include lowercase letters in the whitelist
    :param uppercase: include uppercase letters in the whitelist
    """
    # self.log.debug("create tmp tif")
    # tmp = tempfile.NamedTemporaryFile(suffix=".tif")
    # NOTE: single-argument os.path.join was a no-op and has been dropped
    tmp_path = 'tmpTif_{0}.tif'.format(self.__name__)
    tmp = io.open(tmp_path, mode='wb')
    tmp.close()

    # self.log.debug("create tmp txt")
    # tmp_txt = tempfile.NamedTemporaryFile(suffix=".txt")
    tmp_txt_path = 'tmp_txt_{0}.txt'.format(self.__name__)
    tmp_txt = io.open(tmp_txt_path, mode='wb')
    tmp_txt.close()

    self.log.debug('save tiff')
    self.image.save(tmp.name, 'TIFF')

    if os.name == 'nt':
        tessparams = [resource_filename(
            __package__, 'tesseract/tesseract.exe')]
    else:
        tessparams = ['tesseract']
    # tesseract expects the output name WITHOUT the .txt extension
    tessparams.extend([tmp.name, tmp_txt.name.replace('.txt', '')])

    if subset and (digits or lowercase or uppercase):
        # self.log.debug("create temp subset config")
        # tmp_sub = tempfile.NamedTemporaryFile(suffix=".subset")
        # BUG FIX: was opened 'wb' but written with str data, which raises
        # TypeError on Python 3 -> open in text mode
        with io.open('tmp_sub_{0}.subset'.format(self.__name__),
                     mode='w') as tmp_sub:
            tmp_sub.write('tessedit_char_whitelist ')
            if digits:
                tmp_sub.write('0123456789')
            if lowercase:
                tmp_sub.write('abcdefghijklmnopqrstuvwxyz')
            if uppercase:
                tmp_sub.write('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
            tmp_sub.write(os.linesep)
        tessparams.append('nobatch')
        tessparams.append(tmp_sub.name)

    self.log.debug('run tesseract')
    self.run(tessparams)
    self.log.debug('read txt')

    try:
        with io.open(tmp_txt.name) as fp:
            self.result_captcha = fp.read().replace(os.linesep, '')
    except Exception:
        self.result_captcha = ''

    self.log.debug(self.result_captcha)
    # best-effort cleanup of all temporary files
    try:
        remove(tmp.name)
        remove(tmp_txt.name)
        if subset and (digits or lowercase or uppercase):
            remove(tmp_sub.name)
    except OSError:
        pass
def setUpClass(cls):
    """Create the shared Core instance and purge debug reports left
    over from earlier runs of this test class.
    """
    cls.core = Core()
    report_dir = '{0}.{1}'.format(cls.__module__, cls.__name__)
    for path in glob(os.path.join(report_dir, 'debug_*')):
        remove(path, trash=True)
def decryptCaptcha(
        self, url, get=None, post=None, cookies=False, forceUser=False,
        imgtype="jpg", result_type="textual"):
    """ Loads a captcha and decrypts it with ocr, plugin, user input

    :param url: url of captcha image
    :param get: get part for request
    :param post: post part for request
    :param cookies: True if cookies should be enabled
    :param forceUser: if True, ocr is not used
    :param imgtype: Type of the Image
    :param result_type: 'textual' if text is written on the captcha\
    or 'positional' for captcha where the user have to click\
    on a specific region on the captcha

    :return: result of decrypting
    """
    # avoid shared mutable default arguments (former get={}, post={})
    get = {} if get is None else get
    post = {} if post is None else post

    img = self.load(url, get=get, post=post, cookies=cookies)

    # pseudo-unique tag from the current time; `tag` avoids shadowing
    # the builtin `id`
    tag = ("%.2f" % time())[-6:].replace(".", "")
    temp_name = join("tmp", "tmpCaptcha_%s_%s.%s" % (self.__name__, tag, imgtype))
    # context manager so the handle is closed even if write fails
    with open(temp_name, "wb") as temp_file:
        temp_file.write(img)

    name = "%sOCR" % self.__name__
    has_plugin = name in self.core.pluginManager.getPlugins("internal")

    if self.core.captcha:
        OCR = self.core.pluginManager.loadClass("internal", name)
    else:
        OCR = None

    if OCR and not forceUser:
        sleep(randint(3000, 5000) / 1000.0)
        self.checkAbort()
        ocr = OCR()
        result = ocr.get_captcha(temp_name)
    else:
        task = self.im.createCaptchaTask(img, imgtype, temp_name,
                                         self.__name__, result_type)
        self.task = task
        while task.isWaiting():
            if self.abort():
                self.im.removeTask(task)
                raise Abort()
            sleep(1)

        # TODO task handling
        self.im.removeTask(task)
        if task.error and has_plugin:
            # ignore default error message since the user could use OCR
            self.fail(_("Pil and tesseract not installed and no Client connected for captcha decrypting"))
        elif task.error:
            self.fail(task.error)
        elif not task.result:
            self.fail(_("No captcha result obtained in appropriate time."))

        result = task.result
        self.log.debug("Received captcha result: %s" % str(result))

    if not self.core.debug:
        try:
            remove(temp_name)
        except OSError:  # former bare except hid every error class
            pass

    return result
def _download(self, chunks, resume):
    """Drive the curl multi download loop.

    :param chunks: requested number of chunks
    :param resume: True to resume a previous partial download
    """
    if not resume:
        self.info.clear()
        self.info.add_chunk('{0}.chunk0'.format(
            self.path), (0, 0))  # create an initial entry

    self.chunks = []
    # initial chunk that will load complete file (if needed)
    init = CurlChunk(0, self, None, resume)
    self.chunks.append(init)
    self.manager.add_handle(init.get_handle())

    last_finish_check = 0
    last_time_check = 0
    chunks_done = set()  # list of curl handles that are finished
    chunks_created = False
    done = False
    # This is a resume, if we were chunked originally assume still can
    if self.info.get_count() > 1:
        self.chunk_support = True

    while True:
        # need to create chunks
        # will be set later by first chunk
        if not chunks_created and self.chunk_support and self.size:
            self.flags ^= Connection.Resumable  # TODO: Recheck...
            if not resume:
                self.info.set_size(self.size)
                self.info.create_chunks(chunks)
                self.info.save()

            chunks = self.info.get_count()
            init.set_range(self.info.get_chunk_range(0))

            for i in range(1, chunks):
                c = CurlChunk(
                    i, self, self.info.get_chunk_range(i), resume)
                handle = c.get_handle()
                if handle:
                    self.chunks.append(c)
                    self.manager.add_handle(handle)
                else:
                    # close immediately
                    self.log.debug('Invalid curl handle -> closed')
                    c.close()

            chunks_created = True

        while True:
            ret, _ = self.manager.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        t = time.time()
        # reduce these calls
        # when num_q is 0, the loop is exited
        while last_finish_check + 0.5 < t:
            # list of failed curl handles
            failed = []
            # TODO: Rewrite...
            # save only last exception, we can only raise one anyway
            exc = Exception()
            num_q, ok_list, err_list = self.manager.info_read()

            for c in ok_list:
                chunk = self.find_chunk(c)
                # check if the header implies success,
                # else add it to failed list
                try:
                    chunk.verify_header()
                except ResponseException as e:
                    self.log.debug(
                        'Chunk {0:d} failed'.format(
                            chunk.id + 1))
                    self.log.debug(e, exc_info=True)
                    failed.append(chunk)
                    # BUG FIX: `except ... as exc` unbinds `exc` when the
                    # handler exits (PEP 3110), so the later `raise exc`
                    # hit NameError; rebind to the outer variable instead
                    exc = e
                else:
                    chunks_done.add(c)

            for c in err_list:
                curl, errno, msg = c
                chunk = self.find_chunk(curl)
                # test if chunk was finished
                if errno != 23 or '0 !=' not in msg:
                    failed.append(chunk)
                    exc = pycurl.error(errno, msg)
                    self.log.debug(
                        'Chunk {0:d} failed'.format(chunk.id + 1))
                    self.log.debug(exc, exc_info=True)
                    continue

                # check if the header implies success,
                # else add it to failed list
                try:
                    chunk.verify_header()
                except ResponseException as e:
                    self.log.debug(
                        'Chunk {0:d} failed'.format(
                            chunk.id + 1))
                    self.log.debug(e, exc_info=True)
                    failed.append(chunk)
                    exc = e  # see BUG FIX note above
                else:
                    chunks_done.add(curl)

            if not num_q:  # no more info to get
                # check if init is not finished so we reset download
                # connections
                # note that other chunks are closed and everything
                # downloaded with initial connection
                if failed:
                    if init in failed or init.curl in chunks_done:
                        raise exc
                    self.log.error(
                        'Download chunks failed, fallback to '
                        'single connection | {0}'.format(exc))

                    # list of chunks to clean and remove
                    to_clean = [x for x in self.chunks if x is not init]
                    for chunk in to_clean:
                        self.close_chunk(chunk)
                        self.chunks.remove(chunk)
                        remove(self.info.get_chunk_name(chunk.id))

                    # let first chunk load the rest and update the
                    # info file
                    init.reset_range()
                    self.info.clear()
                    self.info.add_chunk('{0}.chunk0'.format(
                        self.path), (0, self.size))
                    self.info.save()

                last_finish_check = t

                if len(chunks_done) >= len(self.chunks):
                    if len(chunks_done) > len(self.chunks):
                        self.log.warning(
                            'Finished download chunks size incorrect')
                    done = True  # all chunks loaded

                break

        if done:
            break  # all chunks loaded

        # calc speed once per second, averaging over 3 seconds
        if last_time_check + 1 < t:
            len_la = len(self.last_arrived)
            diff = [c.arrived - (self.last_arrived[i] if len_la > i else 0)
                    for i, c in enumerate(self.chunks)]
            self.last_speeds[1] = self.last_speeds[0]
            self.last_speeds[0] = self.speeds
            self.speeds = [float(a) // (t - last_time_check) for a in diff]
            self.last_arrived = [c.arrived for c in self.chunks]
            last_time_check = t

        if self._abort:
            raise Abort

        self.manager.select(1)

    for chunk in self.chunks:
        chunk.flush_file()  # make sure downloads are written to disk

    self._copy_chunks()
def run_tesser(self, subset=False, digits=True, lowercase=True, uppercase=True):
    """Run the tesseract OCR binary over the current captcha image and
    store the recognized text in ``self.result_captcha``.

    :param subset: restrict recognition to a character whitelist
    :param digits: include digits in the whitelist
    :param lowercase: include lowercase letters in the whitelist
    :param uppercase: include uppercase letters in the whitelist
    """
    # self.log.debug("create tmp tif")
    # tmp = tempfile.NamedTemporaryFile(suffix=".tif")
    tmp_path = os.path.join("tmpTif_{0}.tif".format(self.__name__))
    tmp = lopen(tmp_path, mode='wb')
    tmp.close()

    # self.log.debug("create tmp txt")
    # tmp_txt = tempfile.NamedTemporaryFile(suffix=".txt")
    tmp_txt_path = os.path.join("tmp_txt_{0}.txt".format(self.__name__))
    tmp_txt = lopen(tmp_txt_path, mode='wb')
    tmp_txt.close()

    self.log.debug("save tiff")
    self.image.save(tmp.name, 'TIFF')

    if os.name == 'nt':
        tessparams = [
            resource_filename(__package__, 'tesseract/tesseract.exe')
        ]
    else:
        tessparams = ['tesseract']
    # tesseract expects the output name WITHOUT the .txt extension
    tessparams.extend([tmp.name, tmp_txt.name.replace(".txt", "")])

    if subset and (digits or lowercase or uppercase):
        # self.log.debug("create temp subset config")
        # tmp_sub = tempfile.NamedTemporaryFile(suffix=".subset")
        # NOTE(review): file opened 'wb' but written with str below —
        # confirm lopen handles text in binary mode or this raises on py3
        with lopen(os.path.join("tmp_sub_{0}.subset".format(
                self.__name__)), mode='wb') as tmp_sub:
            tmp_sub.write("tessedit_char_whitelist ")
            if digits:
                tmp_sub.write("0123456789")
            if lowercase:
                tmp_sub.write("abcdefghijklmnopqrstuvwxyz")
            if uppercase:
                tmp_sub.write("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
            tmp_sub.write(os.linesep)
        tessparams.append("nobatch")
        tessparams.append(tmp_sub.name)

    self.log.debug("run tesseract")
    self.run(tessparams)
    self.log.debug("read txt")

    try:
        with lopen(tmp_txt.name) as fp:
            self.result_captcha = fp.read().replace(os.linesep, "")
    except Exception:
        self.result_captcha = ""

    self.log.debug(self.result_captcha)
    # best-effort cleanup of all temporary files
    try:
        remove(tmp.name)
        remove(tmp_txt.name)
        if subset and (digits or lowercase or uppercase):
            remove(tmp_sub.name)
    except Exception:
        pass
def decryptCaptcha(self, url, get={}, post={}, cookies=False,
                   forceUser=False, imgtype='jpg', result_type='textual'):
    """
    Loads a captcha and decrypts it with ocr, plugin, user input

    :param url: url of captcha image
    :param get: get part for request
    :param post: post part for request
    :param cookies: True if cookies should be enabled
    :param forceUser: if True, ocr is not used
    :param imgtype: Type of the Image
    :param result_type: 'textual' if text is written on the captcha\
    or 'positional' for captcha where the user have to click\
    on a specific region on the captcha

    :return: result of decrypting
    """
    # NOTE(review): mutable default arguments get={}/post={} are shared
    # across calls; harmless only while never mutated — confirm
    img = self.load(url, get=get, post=post, cookies=cookies)

    # pseudo-unique id derived from the current time's last digits
    id = ("%.2f" % time())[-6:].replace(".", "")
    temp_file = open(
        join("tmp", "tmpCaptcha_%s_%s.%s" % (self.__name__, id, imgtype)),
        "wb")
    temp_file.write(img)
    temp_file.close()

    name = "%sOCR" % self.__name__
    has_plugin = name in self.core.pluginManager.getPlugins("internal")

    if self.core.captcha:
        OCR = self.core.pluginManager.loadClass("internal", name)
    else:
        OCR = None

    if OCR and not forceUser:
        # randomized delay before OCR to mimic a human response time
        sleep(randint(3000, 5000) / 1000.0)
        self.checkAbort()
        ocr = OCR()
        result = ocr.get_captcha(temp_file.name)
    else:
        # hand the captcha to the interaction manager / connected client
        task = self.im.createCaptchaTask(img, imgtype, temp_file.name,
                                         self.__name__, result_type)
        self.task = task
        while task.isWaiting():
            if self.abort():
                self.im.removeTask(task)
                raise Abort()
            sleep(1)

        #TODO task handling
        self.im.removeTask(task)
        if task.error and has_plugin:  #ignore default error message since the user could use OCR
            self.fail(
                _("Pil and tesseract not installed and no Client connected for captcha decrypting"
                  ))
        elif task.error:
            self.fail(task.error)
        elif not task.result:
            self.fail(_("No captcha result obtained in appropriate time."))

        result = task.result
        self.log.debug("Received captcha result: %s" % str(result))

    if not self.core.debug:
        # best-effort removal of the temporary captcha image
        try:
            remove(temp_file.name)
        except:
            pass

    return result
def decrypt_captcha(self, url, get={}, post={}, cookies=True,
                    forceuser=False, imgtype='jpg', result_type='textual'):
    """
    Loads a captcha and decrypts it with ocr, plugin, user input

    :param url: url of captcha image
    :param get: get part for request
    :param post: post part for request
    :param cookies: True if cookies should be enabled
    :param forceuser: if True, ocr is not used
    :param imgtype: Type of the Image
    :param result_type: 'textual' if text is written on the captcha
        or 'positional' for captcha where the user have to click
        on a specific region on the captcha

    :return: result of decrypting
    """
    # NOTE(review): mutable default arguments get={}/post={} are shared
    # across calls; harmless only while never mutated — confirm
    img = self.load(url, get=get, post=post, cookies=cookies)

    # pseudo-unique id derived from the current time's last digits
    id = "{0:.2f}".format(time.time())[-6:].replace(".", "")
    with lopen(os.path.join("tmp_captcha_{0}_{1}.{2}".format(
            self.__name__, id, imgtype)), mode='wb') as fp:
        fp.write(img)
    # fp is closed past this point; only fp.name is used below

    name = "{0}OCR".format(self.__name__)
    has_plugin = name in self.__pyload.pgm.get_plugins("internal")

    if self.__pyload.captcha:
        OCR = self.__pyload.pgm.load_class("internal", name)
    else:
        OCR = None

    if OCR and not forceuser:
        # randomized delay before OCR to mimic a human response time
        time.sleep(random.randint(3000, 5000) // 1000)
        self.check_abort()
        ocr = OCR()
        result = ocr.get_captcha(fp.name)
    else:
        # hand the captcha to the event manager / connected client
        task = self.__pyload.exm.create_captcha_task(
            img, imgtype, fp.name, self.__name__, result_type)
        self.task = task
        while task.is_waiting():
            if self.abort():
                self.__pyload.exm.remove_task(task)
                raise Abort
            time.sleep(1)

        # TODO: task handling
        self.__pyload.exm.remove_task(task)
        if task.error and has_plugin:
            # ignore default error message since the user could use OCR
            self.fail(
                self.
                _("Pil and tesseract not installed and no Client connected for captcha decrypting"
                  ))
        elif task.error:
            self.fail(task.error)
        elif not task.result:
            self.fail(
                self._(
                    "No captcha result obtained in appropriate time"))

        result = task.result
        self.__pyload.log.debug(
            "Received captcha result: {0}".format(result))

    if not self.__pyload.debug:
        # best-effort removal of the temporary captcha image
        try:
            remove(fp.name)
        except Exception:
            pass

    return result
def _download(self, chunks, resume):
    """Drive the curl multi download loop.

    :param chunks: requested number of chunks
    :param resume: True to resume a previous partial download
    """
    if not resume:
        self.info.clear()
        self.info.add_chunk("{0}.chunk0".format(
            self.path), (0, 0))  # create an initial entry

    self.chunks = []
    # initial chunk that will load complete file (if needed)
    init = CurlChunk(0, self, None, resume)
    self.chunks.append(init)
    self.__manager.add_handle(init.get_handle())

    last_finish_check = 0
    last_time_check = 0
    chunks_done = set()  # list of curl handles that are finished
    chunks_created = False
    done = False
    # This is a resume, if we were chunked originally assume still can
    if self.info.get_count() > 1:
        self.chunk_support = True

    while True:
        # need to create chunks
        # will be set later by first chunk
        if not chunks_created and self.chunk_support and self.size:
            self.flags ^= Connection.Resumable  # TODO: Recheck...
            if not resume:
                self.info.set_size(self.size)
                self.info.create_chunks(chunks)
                self.info.save()

            chunks = self.info.get_count()
            init.set_range(self.info.get_chunk_range(0))

            for i in range(1, chunks):
                c = CurlChunk(
                    i, self, self.info.get_chunk_range(i), resume)
                handle = c.get_handle()
                if handle:
                    self.chunks.append(c)
                    self.__manager.add_handle(handle)
                else:
                    # close immediately
                    self.log.debug("Invalid curl handle -> closed")
                    c.close()

            chunks_created = True

        while True:
            ret, num_handles = self.__manager.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        t = time.time()
        # reduce these calls
        # when num_q is 0, the loop is exited
        while last_finish_check + 0.5 < t:
            # list of failed curl handles
            failed = []
            # TODO: Rewrite...
            # save only last exception, we can only raise one anyway
            ex = Exception()
            num_q, ok_list, err_list = self.__manager.info_read()

            for c in ok_list:
                chunk = self.find_chunk(c)
                # check if the header implies success,
                # else add it to failed list
                try:
                    chunk.verify_header()
                except ResponseException as e:
                    self.log.debug(
                        "Chunk {0:d} failed: {1}".format(
                            chunk.id + 1, str(e)))
                    failed.append(chunk)
                    # rebind so the exception survives the handler scope
                    ex = e
                else:
                    chunks_done.add(c)

            for c in err_list:
                curl, errno, msg = c
                chunk = self.find_chunk(curl)
                # test if chunk was finished
                if errno != 23 or "0 !=" not in msg:
                    failed.append(chunk)
                    ex = pycurl.error(errno, msg)
                    self.log.debug(
                        "Chunk {0:d} failed: {1}".format(chunk.id + 1, ex))
                    continue

                # check if the header implies success,
                # else add it to failed list
                try:
                    chunk.verify_header()
                except ResponseException as e:
                    self.log.debug(
                        "Chunk {0:d} failed: {1}".format(
                            chunk.id + 1, str(e)))
                    failed.append(chunk)
                    ex = e
                else:
                    chunks_done.add(curl)

            if not num_q:  # no more info to get
                # check if init is not finished so we reset download
                # connections
                # note that other chunks are closed and everything
                # downloaded with initial connection
                if failed:
                    if init in failed or init.c in chunks_done:
                        raise ex
                    self.log.error(
                        "Download chunks failed, fallback to "
                        "single connection | {0}".format(ex))

                    # list of chunks to clean and remove
                    to_clean = [x for x in self.chunks if x is not init]
                    for chunk in to_clean:
                        self.close_chunk(chunk)
                        self.chunks.remove(chunk)
                        remove(self.info.get_chunk_name(chunk.id))

                    # let first chunk load the rest and update the
                    # info file
                    init.reset_range()
                    self.info.clear()
                    self.info.add_chunk("{0}.chunk0".format(
                        self.path), (0, self.size))
                    self.info.save()

                last_finish_check = t

                if len(chunks_done) >= len(self.chunks):
                    if len(chunks_done) > len(self.chunks):
                        self.log.warning(
                            "Finished download chunks size incorrect")
                    done = True  # all chunks loaded

                break

        if done:
            break  # all chunks loaded

        # calc speed once per second, averaging over 3 seconds
        if last_time_check + 1 < t:
            len_la = len(self.last_arrived)
            diff = [c.arrived - (self.last_arrived[i] if len_la > i else 0)
                    for i, c in enumerate(self.chunks)]
            self.last_speeds[1] = self.last_speeds[0]
            self.last_speeds[0] = self.speeds
            self.speeds = [float(a) // (t - last_time_check) for a in diff]
            self.last_arrived = [c.arrived for c in self.chunks]
            last_time_check = t

        if self.__abort:
            raise Abort

        self.__manager.select(1)

    for chunk in self.chunks:
        chunk.flush_file()  # make sure downloads are written to disk

    self._copy_chunks()