import os
import time
import tkinter as tk
from PIL import Image, ImageTk
# Reddit and Download are project-local helpers (not shown here).


class Gui:
    def __init__(self):
        """
        Downloads images into the images folder, shows them one by one
        in a tkinter window, and deletes the folder after all images
        have been shown.
        """
        self.window = tk.Tk()
        pad = 3
        # Size the window to (almost) the full screen
        self.window.geometry("{0}x{1}+0+0".format(
            self.window.winfo_screenwidth() - pad,
            self.window.winfo_screenheight() - pad))
        self.label = tk.Label()
        self.label.pack()
        self.Reddit = Reddit()
        self.Download = Download()
        links = self.Reddit.reddit()
        for link in links:
            self.Download.download(link)
        files = os.listdir("images")  # all filenames in the images folder
        counter = 0
        for file in files:
            if counter != 0:
                time.sleep(10)
            photo = ImageTk.PhotoImage(Image.open("images/{}".format(file)))
            self.label.configure(image=photo)
            self.label.image = photo  # keep a reference so Tk does not drop the image
            self.window.update_idletasks()
            counter = 1
        self.Download.delete_folder()
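# Design note: time.sleep() inside __init__ stalls the Tk event loop, so the
# window stays unresponsive between images. A minimal non-blocking sketch using
# window.after() is shown below; SlideshowSketch is a hypothetical name, and it
# assumes Pillow is installed and an images/ folder already exists.

import os
import tkinter as tk
from PIL import Image, ImageTk


class SlideshowSketch:
    def __init__(self):
        self.window = tk.Tk()
        self.label = tk.Label(self.window)
        self.label.pack()
        self.files = os.listdir("images")
        self.index = 0
        self.show_next()
        self.window.mainloop()

    def show_next(self):
        if self.index < len(self.files):
            path = os.path.join("images", self.files[self.index])
            photo = ImageTk.PhotoImage(Image.open(path))
            self.label.configure(image=photo)
            self.label.image = photo  # keep a reference alive
            self.index += 1
            # Schedule the next image instead of sleeping on the main thread.
            self.window.after(10000, self.show_next)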
def btn_start_clicked(self):
    self.progressbar.setValue(0)
    self.data = []
    for row in range(0, self.tw_monitor.rowCount()):
        self.column_data = []
        if self.tw_monitor.item(row, 6).text() == "False":
            self.column_data.append(row)
            for column in range(0, self.tw_monitor.columnCount() - 1):
                self.column_data.append(self.tw_monitor.item(row, column).text())
            self.data.append(self.column_data)
    print(self.data)
    if self.data:
        self.progress = 0
        for row in self.data:
            dl = Download()
            dl.link = row[1]
            if row[2] == "mp3":
                dl.format = "bestaudio/best"
            elif row[2] == "mp4":
                dl.format = "best/best"
            dl.output_path = row[3]
            if row[4] == "":
                dl.is_custom_name = False
                dl.custom_name = ""
            else:
                dl.is_custom_name = True
                dl.custom_name = row[4]
            dl.thumbnail = (row[5] == "True")
            dl.subtitle = (row[6] == "True")
            try:
                dl.download()
                self.tw_monitor.takeItem(row[0], 6)
            except Exception:
                self.tw_monitor.setItem(row[0], 6, QTableWidgetItem("Error"))
            else:
                self.tw_monitor.setItem(row[0], 6, QTableWidgetItem("True"))
            print(int(100 * (self.progress / len(self.data))))
            self.progressbar.setValue(int(100 * (self.progress / len(self.data))))
            self.progress += 1
    self.progressbar.setValue(100)
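# The Download object above is configured attribute by attribute before
# download() is called. "bestaudio/best" and "best/best" are youtube-dl format
# selectors, so a plausible reading of that helper is a thin wrapper around
# youtube_dl. The sketch below is an assumption, not the project's actual class.

import youtube_dl


class DownloadSketch:
    def __init__(self):
        self.link = ""
        self.format = "best/best"
        self.output_path = "."
        self.is_custom_name = False
        self.custom_name = ""
        self.thumbnail = False
        self.subtitle = False

    def download(self):
        name = self.custom_name if self.is_custom_name else "%(title)s"
        opts = {
            "format": self.format,
            "outtmpl": "{}/{}.%(ext)s".format(self.output_path, name),
            "writethumbnail": self.thumbnail,
            "writesubtitles": self.subtitle,
        }
        with youtube_dl.YoutubeDL(opts) as ydl:
            ydl.download([self.link])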
class Spider_Music():
    def __init__(self):
        self.download = Download()
        self.url_manager = Url_Manager()
        self.html_parser = Html_Parser()
        self.save = Save()
        self.set_color = Set_Color()

    def craw(self, url):
        self.url_manager.addurl({'url': url, 'name': 'temp'})
        while self.url_manager.checknewurllength > 0:
            newurl = self.url_manager.geturl()
            if self.save.checkfile(newurl['name']):
                self.set_color.printDarkRed("{} already downloaded!\n".format(newurl['name']))
                continue
            print("Start downloading {} {}".format(newurl['name'], newurl['url']))
            htmlcontent = self.download.download(newurl['url'])
            newurls, result = self.html_parser.parser(htmlcontent)
            self.url_manager.addurls(newurls)
            self.save.save(result, newurl['name'])
            print("Finished downloading {}".format(newurl['name']))
        print("{} songs downloaded in total".format(self.save.count))

    def main(self):
        self.craw('https://music.163.com/#/playlist?id=2492536378')
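# The craw() loop above is the usual url-manager / downloader / parser / saver
# pipeline: pop a URL, skip it if already saved, otherwise download, parse out
# new URLs plus a result, and persist the result. Running it only takes
# (assuming the collaborating classes are importable):

if __name__ == '__main__':
    spider = Spider_Music()
    spider.main()  # crawls the playlist URL hard-coded in main()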
def fetch_cities_in_state(self):
    for alpha in self._alpha:
        url = State.url_prefix + self._abbrv + "/" + alpha
        dest = State.file_prefix + self._abbrv + "/" + alpha
        print("downloading state (%s), url (%s), dest (%s)" % (self._state, url, dest))
        dir = Dir(dest)
        if not dir.exists():
            dir.create_if_needed()
        # check if the data is already present
        data_file = File(dest + "/file")
        if data_file.exists():
            print("data present for state %s, %s" % (self._state, alpha))
            continue
        download = Download(url, dest + "/file")
        download.download()
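# Download here takes a source URL and a destination path. A minimal sketch of
# such a helper using urllib.request is below; this is an assumption about its
# shape, not the original implementation.

from urllib.request import urlopen


class DownloadSketch:
    def __init__(self, url, dest):
        self.url = url
        self.dest = dest

    def download(self):
        # Fetch the URL and write the body to the destination file.
        with urlopen(self.url) as resp, open(self.dest, "wb") as out:
            out.write(resp.read())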
def _install(self, target):
    self.path = ""
    if target.platform == "Windows":
        if target.architecture == "64":
            architecture_string = "-win64-x64"
        else:
            architecture_string = "-win32-x86"
        source_path = "CMake/cmake-3.12.3" + architecture_string + ".zip"
        zip_ref = zipfile.ZipFile(source_path, "r")
        self.path = "Build/cmake-3.12.3" + architecture_string + "/bin/cmake.exe"
        # TODO: the path we delete here doesn't seem right
        shutil.rmtree(self.path, ignore_errors=True)
        zip_ref.extractall("Build")
        zip_ref.close()
    elif target.platform == "Linux":
        download_url = "https://github.com/CodeSmithyIDE/CMake/archive/master.zip"
        download = Download("CMake", download_url, "Build")
        download.download(None)
        download.unzip()
        previous_working_dir = os.getcwd()
        os.chdir("Build/CMake")
        try:
            try:
                subprocess.check_call(["chmod", "0774", "bootstrap"])
            except subprocess.CalledProcessError:
                raise RuntimeError("chmod 0774 bootstrap failed.")
            try:
                subprocess.check_call("./bootstrap")
            except subprocess.CalledProcessError:
                raise RuntimeError("./bootstrap failed.")
            GNUmake().compile("Makefile", None, None)
            self.path = "Build/CMake/bin/cmake"
        finally:
            os.chdir(previous_working_dir)
    else:
        raise RuntimeError("Unsupported platform: " + target.platform)
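# _install only needs a target with platform and architecture attributes. A
# hypothetical invocation (the CMakeInstaller name and the shape of the target
# object are assumptions for illustration):

from types import SimpleNamespace

target = SimpleNamespace(platform="Linux", architecture="64")
installer = CMakeInstaller()  # hypothetical class owning _install()
installer._install(target)
print(installer.path)  # Build/CMake/bin/cmake after a successful bootstrap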
def download():
    data = json.loads(request.get_data(as_text=True))
    url_id = data['url_id']
    name = data['name']
    author = data['author']
    source = data['source']
    d = Download(name, author, url_id, source)
    is_ok = d.download()
    if is_ok:
        ret = {'state': 1}
    else:
        ret = {'state': 0}
    return jsonify(ret)
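# Assuming this handler is registered on a POST route (the decorator is not
# shown), a client call might look like the sketch below; the host, port, and
# /download path are assumptions, and the payload values are made up.

import requests

payload = {
    "url_id": "abc123",
    "name": "example-track",
    "author": "example-author",
    "source": "example-source",
}
resp = requests.post("http://localhost:5000/download", json=payload)
print(resp.json())  # {'state': 1} on success, {'state': 0} otherwise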
def download_song(self, song_id, path='.', id=0, ids=0):
    self.get_real_song_data(song_id)
    mp3Name = "{songName}--{author}.{format}".format(
        songName=self.__SONG_NAME,
        author=self.__SONG_AUTHOR,
        format=self.__SONG_FORMAT,
    ).strip()
    download_flag = (0, 0, 0)  # default flag: cannot find or download this song
    if not self.__SONG_REAL_URL:
        print("No valid URL.")
    else:
        download = Download()
        download_flag = download.download(self.__SONG_REAL_URL, mp3Name, path, id, ids)
    return download_flag
def execute_download(table, queue, logger, is_test=False):
    '''
    Core download function
    params: table -> instance of class Table
    params: queue -> book task queue
    params: logger -> Log().Logger
    '''
    data = queue.get(block=True, timeout=30)
    category = data.get('category')
    book = data.get('book')
    is_finish = data.get('is_finish')
    id = data.get('id')
    item = data.get('item')
    url = data.get('url')
    if is_finish == 1:
        folder = u'完结'  # "finished"
    else:
        folder = u'连载'  # "ongoing"
    filefolder = u'%s/%s/%s/%s' % (DOWNLOAD_FOLDER, folder, category, book)
    if not os.path.exists(filefolder):
        os.makedirs(filefolder)
        message = u'makedirs %s' % (filefolder)
        logger.info(message)
    filename = u'%d-%s.txt' % (id, item)
    filepath = u'%s/%s' % (filefolder, filename)
    download = Download(url=url, logger=logger, filepath=filepath)
    try:
        flag = download.download()
    except Exception as e:
        message = u'caught exception %s during download; putting data %s back into the queue' % (
            e, json.dumps(data, ensure_ascii=False))
        table.logger.error(message)
        queue.put(data)
        flag = False
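# execute_download() pops one task dict from the queue; the keys it reads are
# category, book, is_finish, id, item, and url. A minimal producer sketch using
# the standard-library queue module (field values invented for illustration):

import queue

task_queue = queue.Queue()
task_queue.put({
    'category': 'fantasy',
    'book': 'example-book',
    'is_finish': 1,
    'id': 1,
    'item': 'chapter-001',
    'url': 'http://example.com/chapter-001',
})
# execute_download(table, task_queue, logger) would then pop and download this task.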
class SpiderImages():
    # initialize all helper instances
    def __init__(self):
        self.download = Download()
        self.htmlparser = HtmlParser()
        self.urlmanager = UrlManger()
        self.saveresult = SaveResult()

    def run(self, urls):
        i = 1
        for url in urls:
            # file_dir = url.split('/')[-1]
            self.urlmanager.add_new_url(url)
            while self.urlmanager.has_new_url():
                new_url = self.urlmanager.get_new_url()
                html_cont = self.download.download(new_url)
                new_urls, name, html_cont, t = self.htmlparser.parser(html_cont)
                # print(name)
                self.urlmanager.add_new_urls(new_urls)
                self.saveresult.save(html_cont, name, t)
                print("{} {}".format(i, new_url))
                if i == 100:
                    break
                i += 1

    def main(self, url):
        # self.craw(url)
        self.run(url)
class Crawler(object):
    def __init__(self, seed_id):
        # Initialize the crawl task
        self.task = CrawlTask(seed_id)
        self.save = CrawlerSave(self.task)
        # Initialize the download interface
        self.download = Download(self.task)

    def run_page_actions(self):
        seed_init = self.task.seed_init
        for url in self.task.init_urls:
            print('init_url:%s' % url)
            status, text = self.download.download(self.task, url)
            print('run_page_actions status:%s' % status)
            self.page_parse(seed_init, url, text)

    def page_parse(self, action, url, text, **kwargs):
        # First extract the fields this page is supposed to yield
        extract_fields = action.get('extract_fields') or {}
        type = extract_fields.get('type')
        if type == 'saveall':
            self.save.save(url, text, '')
        # Then process the page actions: follow extracted elements into the
        # next page, or extract the data directly
        page_actions = action.get('add_pages', [])
        for action in page_actions:
            print('URL:%s action:%s' % (url, action))
            self.run_extract_action(action, text, url)

    def run_extract_action(self, action, text, url, **kwargs):
        print('action name:%s' % action.get('action_name'))
        next = action.get('next', {})
        # Extract the URLs leading to the next page
        next_url_list = self.extract_url(action, text, url, **kwargs)
        print('next_url_list:%s' % next_url_list)
        for next_url in next_url_list:
            url_regex = next.get('url_regex')
            # Filter out irrelevant URLs with a regex
            if self.url_filter(url_regex, next_url) is None:
                continue
            print('action:%s next_url:%s' % (action.get('action_name'), next_url))
            status, text = self.download.download(self.task, next_url)
            self.page_parse(action, next_url, text)

    def extract_url(self, action, text, url, **kwargs):
        next = action.get('next', {})
        type = next.get('type', None)
        next_url_list = []  # a list of URLs; only one field is supported for now
        if type == 'xpath':
            next_url_list = self.extract_with_xpath(action, text, url, **kwargs)
        elif type == 'json':
            next_url_list = self.extract_with_json(action, text, url, **kwargs)
        elif type == 're':
            next_url_list = self.extract_with_re(action, text, url, **kwargs)
        elif type == 'pyfunc':
            pass
        return next_url_list

    # Parse data via XPath
    def extract_with_xpath(self, action, text, url, **kwargs):
        body = fromstring(text)
        next = action.get('next', {})
        fields = next.get('fields', {})
        # TODO...
        fields_dict = json.loads(fields)
        url_list = []
        for key, field_value in fields_dict.items():
            print('field_value:%s' % field_value)
            url_temp = next.get('url_temp')
            print('url_temp:%s' % url_temp)
            results = body.xpath(key, smart_strings=False)
            for res in results:
                template = Template(url_temp)
                url = template.render(**{field_value: res})
                url_list.append(url)
        return url_list

    # Parse JSON to find the next entry point
    def extract_with_json(self, action, text, url, **kwargs):
        body = json.loads(text)
        next = action.get('next', {})
        fields = next.get('fields', {})
        fields_dict = json.loads(fields)
        results = {}
        # Collect the output fields; keep it simple for now
        self.get_fields(body, fields_dict, results)
        print('results:%s' % results)
        template = Template(next.get('url_temp'))
        min_len = sys.maxsize
        key_list = results.keys()
        # TODO... this should fetch the length of one specific object
        for key, value_list in results.items():
            min_len = min(min_len, len(value_list))
        url_list = []
        for i in range(0, min_len):
            kwargs = {}
            for key in key_list:
                kwargs[key] = results[key][i]
            url = template.render(kwargs)
            url_list.append(url)
        return url_list

    # key is always the value to fetch
    # field_value names the result bucket
    # body is the JSON body
    # field is a reduced JSON template describing the data to collect
    # collected values end up in results
    def get_fields(self, body, field, results):
        def get_field():
            if isinstance(field_value, list):
                for value in field_value:
                    self.get_fields(new_body, value, results)
            elif isinstance(field_value, dict):
                self.get_fields(new_body, field_value, results)
            elif isinstance(field_value, str):
                results.setdefault(field_value, []).append(new_body)

        if isinstance(field, list):
            for val in field:
                for key, field_value in val.items():
                    new_body = body.get(key)
                    get_field()
        elif isinstance(field, dict):
            for key, field_value in field.items():
                if isinstance(body, dict):
                    new_body = body.get(key)
                    get_field()
                elif isinstance(body, list):
                    for item in body:
                        new_body = item.get(key)
                        get_field()

    # Get the next entry point via a regex
    def extract_with_re(self, action, text, url, **kwargs):
        pass

    def url_filter(self, next_url_regex, next_url):
        search = re.search(next_url_regex, next_url, re.I)
        if search:
            return next_url
        return None
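# To make the traversal concrete: page_parse() reads extract_fields to decide
# what to save on the current page, then each entry in add_pages carries a
# "next" block (type, fields, url_temp, url_regex) that yields the next URLs.
# A hypothetical seed in that shape (all values invented for illustration;
# "fields" is a JSON string because extract_with_xpath calls json.loads on it):

seed_init = {
    'extract_fields': {'type': 'saveall'},  # save the whole page body
    'add_pages': [
        {
            'action_name': 'follow_items',
            'next': {
                'type': 'xpath',
                # map an XPath expression to the template variable it fills
                'fields': '{"//a/@href": "item_path"}',
                'url_temp': 'https://example.com{{ item_path }}',
                'url_regex': r'^https://example\.com/item/\d+$',
            },
            'extract_fields': {'type': 'saveall'},
            'add_pages': [],
        },
    ],
}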
# link.link = "None"
#
# f = open("log.txt", "a")
#
# f.write(str(link.version) + "\n")
# f.write(str(link.downloads_total) + "\n")
# f.write(str(link.downloads_last_week) + "\n")
# f.write(str(link.info_link) + "\n\n")
#
# f.close()
#
# flag = False

if flag:
    h, name, path, save = Download.download(crawlers[option - 1], link.link)
    # if h is not False and save is not False:
    #     print("SUCCESS")
    #
    #     # TODO -> PROBABLE INFINITE LOOP... MUST CHECK WHY!
    #
    #     data = []
    #     # columns: name, download_link, download_date, origin_website,
    #     # total_downloads, last_week_downloads, version, hash
    #     data = (name, link.link, time.strftime("%d/%m/%Y - %H:%M:%S"),
    #             crawlers[option - 1], link.downloads_total,
    #             link.downloads_last_week, link.version, h, path)
    #
    #     DB.insert(data)
    #
    #     number += 1
#!/usr/bin/env python3

###################################################################
#
# Copyright (C) 2020 Shubhrendu Tripathi
#
# GPL v3 License
#
###################################################################

import gi
gi.require_version("Gtk", "3.0")
from gi.repository import Gtk

from download import Download
from ui import UI
from si import SI

url_c19s = "https://api.covid19india.org/csv/latest/state_wise.csv"

if __name__ == "__main__":
    """Covid-19 Statistics"""
    UI()
    SI()
    Gtk.main()
    # Note: Gtk.main() blocks, so this download only runs after the main loop exits.
    Download.download(url_c19s)