def _networkM3U8(self, url: str) -> None:
    url = url.replace("www.", "").removeprefix("https://")
    # NOTE: the original pattern used [...] (a character class) where a group
    # was intended; rewritten with (?:...) so the alternation actually works
    filmName = re.findall(
        r"(?:altadefinizione\..*|altadefinizionecommunity\.net)/(.*/)", url)
    serieName = re.findall(r"seriehd\..*/(.*/)", url)
    if filmName != []:
        out = self.output()
        self.root.destroy()
        film = Film(filmName[0], out)
        if film.init[0] == 'created':
            Downloader(film.outputFile)
        elif film.init[0] == "continuare":  # "continue": resume a started download
            messagebox.showinfo(
                "4K Downloader",
                "Download già iniziato, verrà ripreso")  # "Download already started, it will be resumed"
            Downloader(film.outputFile)
        elif film.init[0] == "sostituire":  # "replace": name clash, different files
            s = messagebox.askyesno(
                "4K Downloader",
                "Nome già esistente ma i file sono diversi.\nSi -> Indicizza file\nNo -> Sostituisci file"
            )  # "Name already exists but the files differ. Yes -> index file, No -> replace file"
            if s:
                film.outputFile = film.outputFile.duplicate()
            else:
                film.outputFile.temp().remove()
                Data.delete(film.outputFile.name)
                Film.initialize(film.outputFile, film.m3u8Path)
            Downloader(film.outputFile)
def main():
    usage = ("Please use -s to specify a search term, and -r to specify"
             " the desired result name")
    try:
        opts, args = getopt.getopt(sys.argv[1:], "s:r:h")
    except getopt.GetoptError:
        print usage
        sys.exit(1)
    search_term = None
    desired_item_name = None
    for opt, arg in opts:
        if opt == '-s':
            search_term = arg
        elif opt == '-r':
            desired_item_name = arg
        elif opt == '-h':
            print usage
            return 0
    if not desired_item_name or not search_term:
        try:
            search_term = raw_input('Search Term: ')
            desired_item_name = raw_input('Desired Item Name: ')
        except EOFError:
            return 0
    downloader = Downloader()
    downloader.download(search_term, desired_item_name)
    return 0
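# Hypothetical command-line usage of main() above (the script name is an
# assumption; -s, -r and -h are the flags getopt actually parses):
#
#   $ python download_cli.py -s "some search term" -r "desired_item_name"
#
# With no flags, the script falls back to the interactive raw_input() prompts.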
def __init__(self):
    """Init main class."""
    self.load_config()
    self.init_config()
    self.myDB = DBManager(self.config["BASIC"]["DBLocation"])
    # discover and register the scraper plugins
    self.plugin_source = self.plugin_base.make_plugin_source(
        searchpath=['./scrapers'])
    for plugin_name in self.plugin_source.list_plugins():
        plugin = self.plugin_source.load_plugin(plugin_name)
        plugin.Scraper().setup(self)
    doScrape = "-noscrape" not in sys.argv
    doDl = "-nodl" not in sys.argv
    if "-add" in sys.argv:
        i = sys.argv.index("-add")
        newShow = sys.argv[i + 1]
        newLang = sys.argv[i + 2]
        self.myDB.orderShow(newShow, newLang)
    if doScrape:
        self.scrape()
    self.save_config()
    self.myDB.matchEpisodes()
    if doDl:
        self.myDL = Downloader(self)
        self.myDL.downloadQueue()
def downloadVideo(self, url, title):
    common.log('Trying to download video ' + str(url))
    # plugin:// URLs cannot be downloaded directly
    if url.startswith('plugin'):
        common.log('Video is not downloadable')
        return None
    path = common.getSetting('download_path')
    if not path:
        path = common.browseFolders(common.translate(30017))
        common.setSetting('download_path', path)
    title = getKeyboard(default=fu.cleanFilename(title), heading='SportsDevil')
    if not title:
        return None
    downloader = Downloader()
    downloaded_file = downloader.downloadMovie(url, path,
                                               fu.cleanFilename(title), '.flv')
    if downloaded_file is None:
        common.log('Download cancelled')
    else:
        common.log('Video ' + url + " downloaded to '" + downloaded_file + "'")
    return downloaded_file
def OnPcdownBtnButton(self, event):
    """Save the downloaded images to the local PC (cyro)."""
    # make sure a target folder has been selected first
    if self.targetDir == "":
        dlg = wx.MessageDialog(self,
                               '사진을 저장할 폴더를 먼저 선택하세요.',  # "Select a folder to save the photos first."
                               '알림',  # "Notice"
                               wx.OK | wx.ICON_INFORMATION)
        try:
            dlg.ShowModal()
            self.selectTargetDirBtn.SetFocus()
            return
        finally:
            dlg.Destroy()
    # create (truncate) an empty log file
    temp = open("goodbyeCy.log", "w")
    temp.close()
    tools.setButtonState(self, self.STATE_STARTED)
    self.isRunning = True
    # run downloader and uploader concurrently (see the producer/consumer model)
    self.downloader = Downloader(self, True)
    self.downloader.start()
    self.monitoringTimer.Start(1000)
    event.Skip()
def link_crawler(seed_url, link_regx=None, delay=5, user_agent='wswp',
                 proxies=None, max_depth=-1, max_urls=-1,
                 scrape_callback=None, num_retries=1, cache=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # url -> depth at which it was found
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    num_urls = 0
    num = 0
    while crawl_queue:
        url = crawl_queue.pop()
        print('url=', url)
        depth = seen[url]  # was seen[seed_url], which always returned depth 0
        html = D(url).decode('utf-8')
        # print('html=', html)
        links = []
        if depth != max_depth:
            if scrape_callback:
                scrape_callback(url, html)
            if link_regx:
                links.extend(link for link in get_links(html)
                             if re.match(link_regx, link))
            for link in links:
                link = 'position.php?' + link
                link = normalize(seed_url, link)
                if link not in seen:
                    seen[link] = depth + 1
                    if same_domain(link, seed_url):
                        crawl_queue.append(link)
            # print('crawl_queue=', crawl_queue)
        num += 1
        print('num=', num)
        num_urls += 1
        if num_urls == max_urls:
            break
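# The crawler snippets in this collection assume a callable Downloader:
# D(url) returns the page body, honouring a politeness delay and retrying
# failed requests. A minimal sketch of that interface, assuming the
# `requests` library is available (the real implementations also support
# proxies and a cache, omitted here):
import time
import requests


class MinimalDownloader:
    def __init__(self, delay=5, user_agent='wswp', num_retries=1):
        self.delay = delay
        self.user_agent = user_agent
        self.num_retries = num_retries
        self.last_request = 0.0

    def __call__(self, url):
        # throttle: sleep out the remainder of the delay window
        wait = self.delay - (time.time() - self.last_request)
        if wait > 0:
            time.sleep(wait)
        self.last_request = time.time()
        # download, retrying on connection errors and 5xx responses
        for _ in range(self.num_retries + 1):
            try:
                resp = requests.get(url,
                                    headers={'User-Agent': self.user_agent})
                if resp.status_code < 500:
                    return resp.content
            except requests.RequestException:
                pass
        return b''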
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wu_being', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads."""
    # the queue of URLs that still need to be crawled, shared via MongoDB
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # claim the next url to process
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit when it receives ctrl-c
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
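# threaded_crawler() relies on MongoQueue for coordination: pop() must
# atomically claim a URL (raising KeyError when none are available) and
# complete() marks it done. A minimal in-memory stand-in with the same
# interface (assumption: the real class is backed by MongoDB and can
# reclaim URLs from crashed workers, which this sketch does not do):
import threading


class InMemoryQueue:
    def __init__(self):
        self._lock = threading.Lock()
        self._pending = []        # pushed but not yet claimed
        self._processing = set()  # claimed but not yet complete

    def __bool__(self):
        with self._lock:
            return bool(self._pending or self._processing)
    __nonzero__ = __bool__  # Python 2 compatibility, matching the snippet

    def push(self, url):
        with self._lock:
            if url not in self._pending and url not in self._processing:
                self._pending.append(url)

    def pop(self):
        with self._lock:
            if not self._pending:
                raise KeyError('queue is empty')
            url = self._pending.pop()
            self._processing.add(url)
            return url

    def peek(self):
        with self._lock:
            return self._pending[-1] if self._pending else None

    def complete(self, url):
        with self._lock:
            self._processing.discard(url)

    def clear(self):
        with self._lock:
            self._pending = []
            self._processing = set()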
def crawl_malware_domains(url):
    """
    Crawl the malware-domains indicator page and collect the dataset links
    to be downloaded and scraped later.

    @param url (string) url of the indicator web page
    @return (dict) {"blocked": [...], "removed": [...]} lists of links
    """
    print('Crawling site: ', url)
    downloader = Downloader()
    print(url)
    html = downloader(url)
    soup = BeautifulSoup(html, 'html5lib')
    possible_links = soup.find_all('a')
    htmlLinks, htmlRemovedLinks = list([]), list([])
    for link in possible_links:
        if link.has_attr('href') and link.attrs['href'][0].isdigit():
            # construct the full path; note this rebinds the `url` parameter,
            # so the branch below uses the mirror base from then on
            url = 'https://mirror.uce.edu.ec/malwaredomains/'
            full_link = '{}{}'.format(url, link.attrs['href'])
            htmlLinks.append(full_link)
        elif link.has_attr('href') and link.attrs['href'].startswith(
                'removed-domains-'):
            # gather all the removed-domain lists
            full_link = '{}{}'.format(url, link.attrs['href'])
            htmlRemovedLinks.append(full_link)
    return {"blocked": htmlLinks, "removed": htmlRemovedLinks}
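# A minimal sketch of how the function above might be invoked (the mirror
# URL is the one already hard-coded in the body):
if __name__ == '__main__':
    links = crawl_malware_domains('https://mirror.uce.edu.ec/malwaredomains/')
    print(len(links['blocked']), 'dataset links,',
          len(links['removed']), 'removed-domain lists')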
def __init__(self):
    super(Main, self).__init__(load_config, NAME_FILE_CONFIG_PATH,
                               NAME_FILE_LOG_PATH, NAME_FILE_CSV_PATH)
    self.loading_args()
    self.log_configuration()
    # general variables
    self.config = self.loading_file_config()
    self.errors = []
    self.fields_csv = self.config.get("GENERAL", "fields_csv").split(",")
    list_links = parser_cvs(self.args.csv, self.fields_csv)
    http = Downloader()
    report = Report()
    for data in list_links:
        download = http.download(data["link"])
        report.add_elements(download)
    report.print_report()
    logging.info("Script Completado.")  # "Script completed."
def run_server():
    db.init_data()
    db.load_to_file()
    player_event = Event()
    thread_player = Player(player_event)
    thread_player.play_current_song()
    thread_player.setDaemon(True)
    thread_player.start()
    down_event = Event()
    thread_down = Downloader(down_event)
    thread_down.setDaemon(True)
    thread_down.start()
    notify_event = Event()
    thread_notify = Notify(notify_event)
    thread_notify.setDaemon(True)
    thread_notify.start()
    maintain_event = Event()
    thread_maintain = Maintain(maintain_event)
    thread_maintain.setDaemon(True)
    thread_maintain.start()
    run(host='0.0.0.0', port=80, server=PasteServer)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 user_agent='wswp', proxies=None, num_retries=1,
                 scrape_callback=None, cache=None):
    """Crawl from the given seed URL following links matched by link_regex."""
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen, and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check that the url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            # decode the page using its detected encoding
            encode_type = chardet.detect(html)
            html = html.decode(encode_type['encoding'])
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether this link has already been crawled
                    if link not in seen:
                        seen[link] = depth + 1
                        # check the link is within the same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.append(link)
            # check whether the download maximum has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)
def __init__(self, seed_url='', link_regex='', callback=None, timeout=30,
             max_threads=1, max_depth=-1, permitted_domains=[], disallow=[],
             limited=True):
    # if 'www.' not in seed_url.lower():
    #     seed_url = seed_url.split('//')[0] + '//www.' + seed_url.split('//')[1]
    self.seed_url = seed_url
    self.link_regex = link_regex
    self.queue = [self.seed_url]
    self.seen = dict()
    self.callback = callback
    self.timeout = timeout
    self.max_threads = max_threads
    self.max_depth = max_depth
    self.threads = list()
    self.permitted_domains = permitted_domains + self.queue
    if isinstance(disallow, str):
        disallow = [disallow]
    self.disallow = disallow + ['mailto:']
    self.download = Downloader()
    self.limited = limited
def run(self):
    self.downloader_thread = threading.Thread(
        target=Downloader(self.data_interface, self.data_sources[0],
                          self.data_sources[1], self.lines, self.stop_event,
                          self.update_interval).run)
    self.downloader_thread.setDaemon(True)
    self.downloader_thread.start()
    self.interact()
def test_encoding_gb18030_20160619b():
    url = "http://zhidao.baidu.com/question/130330820.html"
    agt = Downloader("test", "http://127.0.0.1", request=True)
    content_unicode = agt.request_download(url)
    print content_unicode[:500]
    assert isinstance(content_unicode, unicode)
async def handler(event):
    if event.message.media:
        res = event.message
        msg = await event.message.reply('Download starting')
        file_name = event.message.file.name
        mime_type = event.message.media.document.mime_type
        path = os.getcwd() + '/uploads/' + file_name
        media = res.media.document
        file = types.InputDocumentFileLocation(
            id=media.id,
            access_hash=media.access_hash,
            file_reference=media.file_reference,
            thumb_size='')
        file_size = media.size
        progress = ProgressPercentage(msg, file_size)
        d = Downloader(client, file, file_size, media.dc_id)
        await d.download(path, progress)
        print('\n' + path)
        print('now uploading')
        await client.edit_message(msg, 'File downloaded')
        await client.edit_message(msg, 'Upload starting')
        loop = asyncio.get_running_loop()
        size = float(os.path.getsize(path))
        async with sem:
            print('Uploading')
            # run the blocking upload in a thread pool so the event loop stays free
            url = await loop.run_in_executor(None, uploadFile, path,
                                             file_name, mime_type)
            print('Upload Complete')
            await client.edit_message(msg, 'Upload complete')
            os.remove(path)
def main():
    args = parse_argument()
    try:
        if args.socks5[0] and args.socks5[1]:
            if args.proxy:
                logger.error('invalid proxy protocol count.')
                raise SystemExit
            socks.set_default_proxy(socks.SOCKS5, args.socks5[0],
                                    int(args.socks5[1]), True,
                                    args.socks5[2], args.socks5[3])
            socket.socket = socks.socksocket
    except Exception:
        logger.error('invalid socks5 proxy arguments.')
        raise SystemExit
    t = Thread(args.board, args.thread)
    if not args.downloading:
        thread_info = t.thread_info()
        logger.info('/{}/ - {} - {}'.format(args.board, thread_info['sub'],
                                            const.BOARDS[args.board]))
        logger.info('total images - {}'.format(thread_info['images']))
    else:
        downloader = Downloader(path=args.path, threads=args.threads,
                                timeout=args.timeout, is_thumb=args.thumb)
        q = t.detail_queue()
        downloader.fetch(q)
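# A hypothetical invocation of the fetcher above; the flag names are
# inferred from the argparse attributes the snippet reads (board, thread,
# path, threads, timeout, thumb, downloading) and are assumptions, since
# parse_argument() is not shown:
#
#   $ python fetcher.py --board g --thread 1234567 --downloading \
#         --path ./out --threads 4 --timeout 10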
def fetchIcon(self, iconName):
    iconFileName = None
    if self.weatherIconId >= 200:
        iconFileName, description = WeatherIconMapper.convertIcon(
            self.weatherIconId, self.sunrise.time(), self.sunset.time())
        print("Icon file name: {}, Description: {}".format(
            iconFileName, description))
    if iconFileName is not None:
        self.icon = UiUtility.loadWeatherIcon(iconFileName)
        self.icon.fill(UiColors.GRAY, special_flags=pygame.BLEND_RGB_ADD)
    else:
        # This weather icon ID is not mapped to a weather icon. In this case,
        # fetch the icon from OpenWeatherMap.
        downloader = Downloader(None)
        # TODO: Do in either a background thread, or a coroutine
        url = "http://openweathermap.org/img/w/{}.png".format(iconName)
        downloader.download(url)
        image = downloader.getData()
        # Does the image need to be processed before it can be used by Pygame?
        memFileObj = io.BytesIO(image)
        self.icon = pygame.image.load(memFileObj)
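# Unlike the callable Downloader used by the crawler snippets, fetchIcon()
# assumes a two-step API: download(url) then getData(). A minimal sketch of
# that shape, using only the standard library (the real class presumably
# adds caching and error handling):
from urllib.request import urlopen


class TwoStepDownloader:
    def __init__(self, cache=None):
        self.cache = cache
        self._data = None

    def download(self, url):
        # fetch the resource and hold the raw bytes for getData()
        with urlopen(url) as resp:
            self._data = resp.read()

    def getData(self):
        return self._data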
async def main(args: list):
    config_name = args[1]
    backtest_config = await prep_backtest_config(config_name)
    if backtest_config['exchange'] == 'bybit' and not backtest_config['inverse']:
        print('bybit usdt linear backtesting not supported')
        return
    downloader = Downloader(backtest_config)
    ticks = await downloader.get_ticks(True)
    backtest_config['n_days'] = round_(
        (ticks[-1][2] - ticks[0][2]) / (1000 * 60 * 60 * 24), 0.1)
    if (p := '--plot') in args:
        try:
            candidate = json.load(open(args[args.index(p) + 1]))
            print('plotting given candidate')
        except Exception as e:
            print(os.listdir(backtest_config['session_dirpath']))
            try:
                candidate = json.load(
                    open(backtest_config['session_dirpath'] + 'live_config.json'))
                print('plotting best candidate')
            except:
                return
        print(json.dumps(candidate, indent=4))
        plot_wrap(backtest_config, ticks, candidate)
        return
async def main(args: list):
    config_name = args[1]
    backtest_config = await prep_backtest_config(config_name)
    if backtest_config['exchange'] == 'bybit' and not backtest_config['inverse']:
        print('bybit usdt linear backtesting not supported')
        return
    downloader = Downloader(backtest_config)
    ticks = await downloader.get_ticks(True)
    backtest_config['n_days'] = round_(
        (ticks[-1][2] - ticks[0][2]) / (1000 * 60 * 60 * 24), 0.1)
    start_candidate = None
    if (s := '--start') in args:
        try:
            if os.path.isdir(args[args.index(s) + 1]):
                start_candidate = [
                    json.load(open(f)) for f in glob.glob(
                        os.path.join(args[args.index(s) + 1], '*.json'))
                ]
                print('Starting with all configurations in directory.')
            else:
                start_candidate = json.load(open(args[args.index(s) + 1]))
                print('Starting with specified configuration.')
        except:
            print('Could not find specified configuration.')
def get(self, type, img):
    retImg = self.__createYzmImage(img, type)
    d = Downloader()
    data = dict(
        JsdmYzmPlugin.jsdm_info, **{
            "captchaData": retImg,
            "captchaType": 1023,
            "captchaMinLength": 0,
            "captchaMaxLength": 0
        })
    dataPost = json.dumps(data)
    res = d.request_data_from_url('https://v2-api.jsdama.com/upload', 'post',
                                  dataPost, timeout=-1)
    j_res = json.loads(res)
    dataret = j_res.get('data')
    if dataret:
        recognition = dataret['recognition']
    else:
        recognition = None
    self.getdmResult = res
    print('验证码自动识别:[', recognition, ']')  # "automatic captcha recognition"
    return recognition
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=1, max_urls=-1,
                 user_agent='wswp', proxies=None, num_retries=1,
                 scrape_callback=None, cache=None):
    # Crawl from the seed URL, following links that match link_regex.
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                if link_regex:
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        if same_domain(seed_url, link):
                            crawl_queue.append(link)
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
def __init__(self, link, **kwargs):
    super(Crawler, self).__init__()
    self.lookup = {}
    self.q = Queue()
    self.feed = link.replace('https://', 'http://')
    if self.feed.endswith('/'):
        self.feed = self.feed[0:len(self.feed) - 1]
    self.downloader = Downloader(verify=kwargs.get('verify', True))
    link = link.replace('http://', '')
    link = link.replace('https://', '')
    self.path = kwargs.get(
        'output_path',
        os.path.join(os.path.expanduser('~'),
                     link.translate({ord(c): "" for c in "<>:/\\\"|?*"}) + '.txt'))
    self.links_count = 0
    self.parser = Spider()
    log_level = kwargs.get('LOG', Crawler.INFO_LOG)
    if not os.path.exists(os.path.join(os.path.expanduser('~'), '.crawler')):
        os.makedirs(os.path.join(os.path.expanduser('~'), '.crawler'))
    logging.basicConfig(
        filename=os.path.join(
            os.path.expanduser('~'), '.crawler',
            link.translate({ord(c): "" for c in "<>:/\\\"|?*"}) + '.log'),
        format='%(asctime)s %(levelname)s %(message)s',
        level=log_level)
    logging.getLogger().addHandler(logging.StreamHandler())
def music(update: Update, context: CallbackContext):
    if len(context.args) == 0:
        msg = 'Send a message in the format "/music link_to_youtube"'
        context.bot.send_message(
            chat_id=update.effective_chat.id,
            text=msg,
            reply_to_message_id=update.effective_message.message_id,
            parse_mode=ParseMode.HTML,
        )
        return
    music_src = context.args[0]
    try:
        dl = Downloader(music_src)
        context.bot.send_audio(chat_id=update.effective_chat.id,
                               audio=open(dl.song, 'rb'),
                               performer=dl.author,
                               title=dl.title,
                               caption="@invisiblemusicbot")
    except Exception as e:
        context.bot.send_message(
            chat_id=update.effective_chat.id,
            text=str(e),
            parse_mode=ParseMode.HTML,
        )
def download_files(self, query, engine, headers, waittime, startpage,
                   endpage, filetypes, scrape):
    # applicator = Applicator()
    downloader = Downloader()
    # set the search engine and request pacing
    downloader.set_searchengine(engine)
    downloader.set_waittime(waittime)
    page = startpage
    link_list = []
    scraped_html = ""
    # scrape HTML from the result pages
    while page <= endpage:
        # build the url from the search engine and query
        url = downloader.build_url(query, page)
        # retrieve the html
        html = downloader.scrape_html(url, headers)
        scraped_html = scraped_html + html.text
        # advance to the next results page (the original comment said
        # "increment page by 10 for google/scholar", but the code steps by 1)
        page += 1
        # time.sleep(wait_time)
    # get links from the collected html
    link_list = downloader.scrape_links(scraped_html)
    # filter links by filetype
    filter_list = downloader.filter_links(link_list, filetypes)
    # download files from the filtered links
    downloader.dl_links(filter_list)
def __init__(self, config):
    """Initialize the Weibo class."""
    self.config = config
    # change cookie from string to dict
    if isinstance(self.config['cookie'], str):
        self.config['cookie'] = {
            t.strip().split("=")[0]: t.strip().split("=")[1]
            for t in self.config['cookie'].split(";")
        }
    # user_id_list given as a path: read the ids from the file
    if isinstance(self.config['user_id_list'], str):
        user_id_list = self.config['user_id_list']
        if not os.path.isabs(user_id_list):
            user_id_list = os.path.split(
                os.path.realpath(__file__))[0] + os.sep + user_id_list
        self.config['user_id_list'] = user_id_list
        with open(self.config['user_id_list'], 'rb') as f:
            lines = f.read().splitlines()
            lines = [line.decode('utf-8') for line in lines]
            self.config['user_id_list'] = [
                line.split(' ')[0] for line in lines
                if len(line.split(' ')) > 0 and line.split(' ')[0].isdigit()
            ]
    # since_date given as an int: interpret it as "N days ago"
    if isinstance(self.config['since_date'], int):
        self.config['since_date'] = str(
            date.today() - timedelta(self.config['since_date']))
    self.validator = Validator(self.config)
    self.validator.validate()
    self.printer = Printer()
    self.writer = Writer(self.config)
    self.downloader = Downloader(self.config)
    self.parser = Parser(self.config)
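# A hypothetical minimal config for the Weibo class above, illustrating only
# the keys the constructor itself reads; the full config is checked by
# Validator and is larger, and these values are examples, not real data:
example_config = {
    'cookie': 'SUB=example; SUBP=example',  # string form, converted to a dict
    'user_id_list': ['1669879400'],         # or a path to a text file of ids
    'since_date': 7,                        # int is converted to a date string
}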
def download_chapters(conf: config.Config) -> Optional[Tuple[int, int]]:
    print("Which chapters do you want to download?")
    chapter_start = utils.input_int("First chapter? ")
    chapter_end = utils.input_int("Last chapter? ", minval=chapter_start,
                                  default=chapter_start)
    chapters_on_disk = utils.get_chapters_on_disk(conf.book)
    chapters = list(range(chapter_start, chapter_end + 1))
    if any(ch in chapters_on_disk for ch in chapters):
        if not utils.input_yes_no(
                "Some of these chapters are already on disk. "
                "Do you want to redownload them?"):
            chapters = [ch for ch in chapters if ch not in chapters_on_disk]
            if not chapters:
                print("All chapters are already on disk. Bye.")
                return None
    try:
        result = Downloader(conf).download_chapters(chapters)
        if result:
            return chapter_start, chapter_end
        return None
    except:
        print()
        raise
async def main():
    parser = argparse.ArgumentParser(prog='Optimize',
                                     description='Optimize passivbot config.')
    parser = add_argparse_args(parser)
    parser.add_argument(
        '-t', '--start',
        type=str,
        required=False,
        dest='starting_configs',
        default='none',
        help='start with given live configs. single json file or dir with multiple json files')
    args = parser.parse_args()
    config = await prep_config(args)
    if config['exchange'] == 'bybit' and not config['inverse']:
        print('bybit usdt linear backtesting not supported')
        return
    downloader = Downloader(config)
    print()
    for k in (keys := ['exchange', 'symbol', 'starting_balance', 'start_date',
                       'end_date', 'latency_simulation_ms', 'do_long', 'do_shrt',
                       'minimum_liquidation_distance', 'max_hrs_no_fills',
                       'max_hrs_no_fills_same_side', 'iters', 'n_particles']):
        if k in config:
            print(f"{k: <{max(map(len, keys)) + 2}} {config[k]}")
def __init__(self):
    # Python 2: re-expose and set the default string encoding
    reload(sys)
    sys.setdefaultencoding('UTF-8')
    self.title = 'Nada'
    # menu entries: '收藏' = favorites, '关于' = about
    self.model = ['luoo 落网', 'echo 回声', 'nada 收藏', '关于']
    self.view = 'menu'
    self.ctrl = 'menu'

    self.offset = 0
    self.index = 0
    self.step = 10
    self.play_id = -1
    self.play_vol = -1
    self.present = []
    self.stack = []

    self.player = Player()
    self.ui = UI()
    self.luoo = Luoo()
    self.echo = Echo()
    self.downloader = Downloader()
    self.database = Database()
    self.database.load()
    self.collections = self.database.data['collections'][0]

    self.screen = curses.initscr()
    self.screen.keypad(1)
async def async_run(self, event):
    tag = self.tag_name.GetValue()
    dir_name = self.dir_name.GetValue()
    port = self.proxy.GetValue()
    begin = int(self.begin.GetValue())
    end = int(self.end.GetValue())
    conn = int(self.conn.GetValue())
    dw = int(self.download.GetValue())
    if not all([tag, dir_name, begin, end, conn, dw]):
        print('参数有误或未填写完整')  # "Invalid or incomplete parameters."
    else:
        self.download_button.Disable()
        proxy = None
        if port:
            proxy = f'http://127.0.0.1:{port}'
        downloader = Downloader(tag, dir_name, begin, end, proxy=proxy,
                                max_conn_num=conn, max_download_num=dw)
        await downloader.start()
        self.download_button.Enable()
def __init__(self, start_url, subdomains):
    self.start_req = Request('get', start_url, '/')
    self.scheduler = Scheduler(subdomains)
    self.spider = Spider()
    self.downloader = Downloader()
    # enqueue the initial request
    self.scheduler.put_request(self.start_req)
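# A hypothetical crawl loop consistent with the components wired up above;
# every method name except put_request() is an assumption about the
# Scheduler/Spider/Downloader interfaces, so this is left as comments:
#
# def run(self):
#     while self.scheduler.has_pending():
#         req = self.scheduler.get_request()
#         resp = self.downloader.fetch(req)
#         for new_req in self.spider.parse(resp):
#             self.scheduler.put_request(new_req)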