Example #1
import os
import time
import tkinter as tk
from PIL import Image, ImageTk
# Reddit and Download are project-specific helpers assumed to be in scope.

class Gui:
    def __init__(self):
        """ Download images and saves them into the folder images 
            shows them one by one in a tkinter window and then
            delete the folder after all images has been shown. """

        self.window = tk.Tk()
        pad = 3
        self.window.geometry("{0}x{1}+0+0".format(
            self.window.winfo_screenwidth() - pad,
            self.window.winfo_screenheight() -
            pad))  # Set the window to near-fullscreen size
        self.label = tk.Label()
        self.label.pack()

        self.Reddit = Reddit()
        self.Download = Download()
        links = self.Reddit.reddit()
        for link in links:
            self.Download.download(link)

        files = os.listdir(
            "images")  # Creates an array with all filenames in images map

        counter = 0
        for file in files:
            if counter != 0: time.sleep(10)
            photo = ImageTk.PhotoImage(Image.open("images/{}".format(file)))
            self.label.configure(image=photo)
            self.label.image = photo
            self.window.update_idletasks()
            counter = 1

        self.Download.delete_folder()
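For context, a minimal sketch of the two helper classes this snippet calls into (names and behavior are inferred from the calls above; the real project code is not shown here):

import os
import shutil
import urllib.request

class Reddit:
    def reddit(self):
        # Would normally fetch image links from Reddit; placeholder here.
        return ["https://i.redd.it/example.jpg"]

class Download:
    def download(self, link):
        # Save each linked image into the "images" folder.
        os.makedirs("images", exist_ok=True)
        filename = os.path.join("images", link.rsplit("/", 1)[-1])
        urllib.request.urlretrieve(link, filename)

    def delete_folder(self):
        shutil.rmtree("images", ignore_errors=True)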
Example #2
class Gui:
    def __init__(self):
    
        """ Download images and saves them into the folder images 
            shows them one by one in a tkinter window and then
            delete the folder after all images has been shown. """
    
        self.window = tk.Tk()
        pad = 3
        self.window.geometry("{0}x{1}+0+0".format(self.window.winfo_screenwidth()-pad, 
                    self.window.winfo_screenheight()-pad)) # Set the window to near-fullscreen size
        self.label = tk.Label() 
        self.label.pack()            
        
        self.Reddit = Reddit()
        self.Download = Download()
        links = self.Reddit.reddit()
        for link in links:
            self.Download.download(link)
            
        files = os.listdir("images") # Creates an array with all filenames in images map

        
        counter = 0
        for file in files:
            if counter != 0: time.sleep(10)
            photo = ImageTk.PhotoImage(Image.open("images/{}".format(file)))
            self.label.configure(image=photo)
            self.label.image = photo 
            self.window.update_idletasks()
            counter = 1
            
        self.Download.delete_folder()
Example #3
    def btn_start_clicked(self):
        self.progressbar.setValue(0)
        self.data = []
        for row in range(0, self.tw_monitor.rowCount()):
            self.column_data = []
            if self.tw_monitor.item(row, 6).text() == "False":
                self.column_data.append(row)
                for column in range(0, self.tw_monitor.columnCount() - 1):
                    self.column_data.append(self.tw_monitor.item(row, column).text())
                self.data.append(self.column_data)

        print(self.data)

        if self.data:
            self.progress = 0
            for row in self.data:

                dl = Download()

                dl.link = row[1]
                if row[2] == "mp3":
                    dl.format = "bestaudio/best"
                elif row[2] == "mp4":
                    dl.format = "best/best"

                dl.output_path = row[3]
                if row[4] == "":
                    dl.is_custom_name = False
                    dl.custom_name = ""
                else:
                    dl.is_custom_name = True
                    dl.custom_name = row[4]
                if row[5] == "True":
                    dl.thumbnail = True
                else:
                    dl.thumbnail = False
                if row[6] == "True":
                    dl.subtitle = True
                else:
                    dl.subtitle = False

                try:
                    dl.download()
                    self.tw_monitor.takeItem(row[0], 6)
                except Exception:
                    self.tw_monitor.setItem(row[0], 6, QTableWidgetItem("Error"))
                else:
                    self.tw_monitor.setItem(row[0], 6, QTableWidgetItem("True"))

                print(int(100 * (self.progress / len(self.data))))
                self.progressbar.setValue(int(100 * (self.progress / len(self.data))))
                self.progress += 1

            self.progressbar.setValue(100)
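The method above configures a Download object attribute by attribute before calling download(). A rough sketch of what such a class might look like, assuming it wraps youtube_dl (suggested by the "bestaudio/best" format strings, not confirmed by the snippet):

import youtube_dl

class Download:
    def __init__(self):
        # Defaults mirror the attributes btn_start_clicked assigns.
        self.link = ""
        self.format = "best/best"
        self.output_path = "."
        self.is_custom_name = False
        self.custom_name = ""
        self.thumbnail = False
        self.subtitle = False

    def download(self):
        name = self.custom_name if self.is_custom_name else "%(title)s"
        opts = {
            "format": self.format,
            "outtmpl": "{}/{}.%(ext)s".format(self.output_path, name),
            "writethumbnail": self.thumbnail,
            "writesubtitles": self.subtitle,
        }
        with youtube_dl.YoutubeDL(opts) as ydl:
            ydl.download([self.link])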
Example #4
class Spider_Music():
	
	def __init__(self):
		self.download = Download()
		self.url_manager = Url_Manager()
		self.html_parser = Html_Parser()
		self.save = Save()
		self.set_color = Set_Color()
		
	def craw(self, url):
		self.url_manager.addurl({'url': url, 'name': 'temp'})

		while self.url_manager.checknewurllength > 0:
			newurl = self.url_manager.geturl()

			if self.save.checkfile(newurl['name']):
				self.set_color.printDarkRed("{} already downloaded!\n".format(newurl['name']))
				continue

			print("Start downloading {} {}".format(newurl['name'], newurl['url']))
			htmlcontent = self.download.download(newurl['url'])
			newurls, result = self.html_parser.parser(htmlcontent)

			self.url_manager.addurls(newurls)
			self.save.save(result, newurl['name'])
			print("Finished downloading {}".format(newurl['name']))
		print("Downloaded {} songs in total".format(self.save.count))
		
	def main(self):
		self.craw('https://music.163.com/#/playlist?id=2492536378')
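The craw loop relies on a Url_Manager with addurl/addurls/geturl and a checknewurllength that is compared without being called, so it is presumably a property. A minimal sketch consistent with that usage (the dedup logic is an assumption):

class Url_Manager:
    def __init__(self):
        self.new_urls = []
        self.seen = set()

    @property
    def checknewurllength(self):
        # Accessed without parentheses above, hence a property.
        return len(self.new_urls)

    def addurl(self, url_dict):
        if url_dict['url'] not in self.seen:
            self.seen.add(url_dict['url'])
            self.new_urls.append(url_dict)

    def addurls(self, url_dicts):
        for url_dict in url_dicts:
            self.addurl(url_dict)

    def geturl(self):
        return self.new_urls.pop(0)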
Example #5
    def fetch_cities_in_state(self):
        for alpha in self._alpha:
            url = State.url_prefix + self._abbrv + "/" + alpha
            dest = State.file_prefix + self._abbrv + "/" + alpha
            print "downloading state (%s), url (%s), state (%s)" % (self._state, url, dest)
            dir = Dir(dest)
            if not dir.exists():
                dir.create_if_needed()

            # check if data is present
            data_file = File(dest + "/file")
            if data_file.exists():
                print("data present for state %s, %s" % (self._state, alpha))
                continue

            download = Download(url, dest + "/file")
            download.download()
Example #6
    def fetch_cities_in_state(self):
        for alpha in self._alpha:
            url = State.url_prefix + self._abbrv + '/' + alpha
            dest = State.file_prefix + self._abbrv + '/' + alpha
            print('downloading state (%s), url (%s), dest (%s)' % (
                self._state, url, dest))
            dir = Dir(dest)
            if not dir.exists():
                dir.create_if_needed()

            # check if data is present
            data_file = File(dest + '/file')
            if data_file.exists():
                print('data present for state %s, %s' % (self._state, alpha))
                continue

            download = Download(url, dest + '/file')
            download.download()
Example #7
    def _install(self, target):
        self.path = ""
        if target.platform == "Windows":
            architecture_string = ""
            if target.architecture == "64":
                architecture_string = "-win64-x64"
            else:
                architecture_string = "-win32-x86"
            source_path = "CMake/cmake-3.12.3" + architecture_string + ".zip"
            zip_ref = zipfile.ZipFile(source_path, "r")
            self.path = "Build/cmake-3.12.3" + architecture_string + \
                        "/bin/cmake.exe"

            # Remove any leftover extraction before unpacking the archive
            # (the original deleted self.path, which points at cmake.exe).
            shutil.rmtree("Build/cmake-3.12.3" + architecture_string,
                          ignore_errors=True)
            zip_ref.extractall("Build")
            zip_ref.close()
        elif target.platform == "Linux":
            download_url = "https://github.com/CodeSmithyIDE/CMake/archive/master.zip"
            download = Download("CMake", download_url, "Build")
            download.download(None)
            download.unzip()
            previous_working_dir = os.getcwd()
            os.chdir("Build/CMake")
            try:
                try:
                    subprocess.check_call(["chmod", "0774", "bootstrap"])
                except subprocess.CalledProcessError:
                    raise RuntimeError("chmod 0774 bootstrap failed.")
                try:
                    subprocess.check_call("./bootstrap")
                except subprocess.CalledProcessError:
                    raise RuntimeError("./bootstrap failed.")
                GNUmake().compile("Makefile", None, None)
                self.path = "Build/CMake/bin/cmake"
            finally:
                os.chdir(previous_working_dir)
        else:
            raise RuntimeError("Unsupported platform: " + target.platform)
Example #8
import json
from flask import request, jsonify
# Download is a project-specific class; the snippet also omits the
# @app.route decorator that presumably registers this view.

def download():
    data = json.loads(request.get_data(as_text=True))
    url_id = data['url_id']
    name = data['name']
    author = data['author']
    source = data['source']
    d = Download(name, author, url_id, source)
    is_ok = d.download()
    if is_ok:
        ret = {'state': 1}
    else:
        ret = {'state': 0}
    return jsonify(ret)
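The view above expects a JSON body with url_id, name, author and source, and answers {'state': 1} on success. A client sketch (the /download route, host, and all payload values are assumptions; the snippet omits its route registration):

import requests

payload = {
    'url_id': 'abc123',     # hypothetical values throughout
    'name': 'Some Song',
    'author': 'Some Artist',
    'source': 'netease',
}
resp = requests.post('http://localhost:5000/download', json=payload)
print(resp.json())  # {'state': 1} on success, {'state': 0} otherwise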
Example #9
    def download_song(self, song_id, path='.', id=0, ids=0):
        # Fetch the real song data; the song may be unavailable for download.
        self.get_real_song_data(song_id)
        mp3Name = "{songName}--{author}.{format}".format(
            songName = self.__SONG_NAME,
            author = self.__SONG_AUTHOR,
            format = self.__SONG_FORMAT,
            ).strip()

        download_flag = (0, 0, 0)
        if not self.__SONG_REAL_URL:
            print("No valid Url.")
        else:
            download = Download()
            download_flag = download.download(self.__SONG_REAL_URL, 
                                              mp3Name, path, id, ids)
        return download_flag
Example #10
import os
import json
# Download and the DOWNLOAD_FOLDER constant come from the surrounding project.

def execute_download(table, queue, logger, is_test=False):
    '''
    Core download function.
    params: table -> an instance of class Table
    params: queue -> the book task queue
    params: logger -> Log().Logger
    '''

    data = queue.get(block=True, timeout=30)
    category = data.get('category')
    book = data.get('book')
    is_finish = data.get('is_finish')
    id = data.get('id')
    item = data.get('item')
    url = data.get('url')

    if is_finish == 1:
        folder = u'完结'  # "completed"
    else:
        folder = u'连载'  # "serialized (ongoing)"

    filefolder = u'%s/%s/%s/%s' % (DOWNLOAD_FOLDER, folder, category, book)

    if not os.path.exists(filefolder):
        os.makedirs(filefolder)
        message = u'makedirs %s' % (filefolder)
        logger.info(message)

    filename = u'%d-%s.txt' % (id, item)
    filepath = u'%s/%s' % (filefolder, filename)

    download = Download(url=url, logger=logger, filepath=filepath)

    try:
        flag = download.download()

    except Exception as e:

        message = u'caught exception %s while executing download; putting data %s back on the queue' % (
            e, json.dumps(data, ensure_ascii=False))
        table.logger.error(message)
        queue.put(data)
        flag = False
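execute_download pulls one task dict off the queue and reads six keys from it. A minimal sketch of producing such a task (values are invented for illustration):

import queue

q = queue.Queue()
q.put({
    'category': 'fantasy',      # all values here are hypothetical
    'book': 'Some Book',
    'is_finish': 1,             # 1 selects the "completed" folder
    'id': 1,
    'item': 'Chapter 1',
    'url': 'http://example.com/chapter-1',
})
# execute_download(table, q, logger) would then consume this task.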
Example #11
class SpiderImages():

    # Initialize all helper instances
    def __init__(self):
        self.download = Download()
        self.htmlparser = HtmlParser()
        self.urlmanager = UrlManger()
        self.saveresult = SaveResult()

    def run(self, urls):
        i = 1
        for url in urls:
            # file_dir = url.split('/')[-1]
            self.urlmanager.add_new_url(url)

            while self.urlmanager.has_new_url():

                new_url = self.urlmanager.get_new_url()

                html_cont = self.download.download(new_url)

                new_urls, name, html_cont, t = self.htmlparser.parser(
                    html_cont)
                #print(name)

                self.urlmanager.add_new_urls(new_urls)

                self.saveresult.save(html_cont, name, t)
                print("{} {}".format(i, new_url))
                if i == 100:
                    break
                i += 1

    def main(self, urls):
        # self.craw(urls)
        self.run(urls)
Example #12
import json
import re
import sys
# Assumed imports: fromstring and Template are used below; lxml.html and
# jinja2 are plausible sources, inferred from how they are called.
from lxml.html import fromstring
from jinja2 import Template

class Crawler(object):
    def __init__(self, seed_id):
        # Initialize the crawl task
        self.task = CrawlTask(seed_id)
        self.save = CrawlerSave(self.task)
        # Initialize the download interface
        self.download = Download(self.task)

    def run_page_actions(self):
        seed_init = self.task.seed_init
        for url in self.task.init_urls:
            print('init_url:%s' % url)
            status, text = self.download.download(self.task, url)
            print('run_page_actions  status:%s' % status)

            # # First extract the fields the previous page should yield
            # extract_fields = seed_init.get('extract_fields')
            # #
            #
            # # Handle this page's requests: follow elements to the next page
            # # or extract data directly
            # page_actions = self.task.seed_init.get('add_pages')
            # for action in page_actions:
            #     print('URL:%s  action:%s' % (url, action))
            #     self.run_extract_action(action, text, url)
            self.page_parse(seed_init, url, text)

    def page_parse(self, action, url, text, **kwargs):
        # First extract the fields the previous page should yield
        extract_fields = action.get('extract_fields')
        type = extract_fields.get('type')
        if type == 'saveall':
            self.save.save(url, text, '')

        # Handle this page's requests: follow elements to the next page
        # or extract data directly
        page_actions = action.get('add_pages', [])
        for action in page_actions:
            print('URL:%s  action:%s' % (url, action))
            self.run_extract_action(action, text, url)

    def run_extract_action(self, action, text, url, **kwargs):
        print('action name:%s' % action.get('action_name'))
        next = action.get('next', {})
        # Extract elements that lead to the next page
        type = next.get('type', None)
        fields = next.get('fields', None)
        next_url_list = []  # Returns a list of URLs; only one field is supported for now
        if type == 'xpath':
            next_url_list = self.extract_with_xpath(action, text, url,
                                                    **kwargs)
        elif type == 'json':
            next_url_list = self.extract_with_json(action, text, url, **kwargs)
        elif type == 're':
            next_url_list = self.extract_with_re(action, text, url, **kwargs)
        elif type == 'pyfunc':
            pass

        print('next_url_list:%s' % next_url_list)
        for next_url in next_url_list:
            url_regex = next.get('url_regex')
            # Filter out irrelevant URLs using a regular expression
            if self.url_filter(url_regex, next_url) is None:
                continue

            print('action:%s next_url:%s' %
                  (action.get('action_name'), next_url))
            status, text = self.download.download(self.task, next_url)
            self.page_parse(action, next_url, text)

    def extract_url(self, action, text, url, **kwargs):
        next = action.get('next', {})
        type = next.get('type', None)
        fields = next.get('fields', None)
        next_url_list = []  # Returns a list of URLs; only one field is supported for now
        if type == 'xpath':
            next_url_list = self.extract_with_xpath(action, text, url,
                                                    **kwargs)
        elif type == 'json':
            next_url_list = self.extract_with_json(action, text, url, **kwargs)
        elif type == 're':
            next_url_list = self.extract_with_re(action, text, url, **kwargs)
        elif type == 'pyfunc':
            pass
        return next_url_list

    # Extract data via XPath
    def extract_with_xpath(self, action, text, url, **kwargs):
        body = fromstring(text)
        next = action.get('next', {})
        fields = next.get('fields', {})  # TODO...
        fields_dict = json.loads(fields)
        url_list = []
        for key, field_value in fields_dict.items():
            print('field_value:%s' % field_value)
            url_temp = next.get('url_temp')
            print('url_temp:%s' % url_temp)
            results = body.xpath(key, smart_strings=False)
            for res in results:
                template = Template(url_temp)
                url = template.render(**{field_value: res})
                url_list.append(url)
        return url_list

    # Parse JSON to find the next entry point
    def extract_with_json(self, action, text, url, **kwargs):
        body = json.loads(text)
        next = action.get('next', {})
        fields = next.get('fields', {})
        next_bodys = []
        next_bodys.append(body)
        fields_dict = json.loads(fields)
        results = {}  # Collected output; keep it simple for now
        self.get_fields(body, fields_dict, results)
        print('results:%s' % results)
        template = Template(next.get('url_temp'))
        min_len = sys.maxsize
        key_list = results.keys()  # TODO... should target one specific object's length
        for key, value_list in results.items():
            min_len = min(min_len, len(value_list))

        url_list = []
        for i in range(0, min_len):
            kwargs = {}
            for key in key_list:
                kwargs[key] = results[key][i]
            url = template.render(kwargs)
            url_list.append(url)
        return url_list

    # key: the value that is always being looked up
    # value: the output name for it
    # body: the JSON body
    # field: a reduced JSON template describing the data to collect
    # results: the collected data is ultimately stored in results
    def get_fields(self, body, field, results):
        def get_field():
            if isinstance(field_value, list):
                for value in field_value:
                    self.get_fields(new_body, value, results)
            elif isinstance(field_value, dict):
                self.get_fields(new_body, field_value, results)
            elif isinstance(field_value, str):
                results.setdefault(field_value, []).append(new_body)

        if isinstance(field, list):
            for val in field:
                for key, field_value in val.items():
                    new_body = body.get(key)
                    get_field()
        elif isinstance(field, dict):
            for key, field_value in field.items():
                if isinstance(body, dict):
                    new_body = body.get(key)
                    get_field()
                elif isinstance(body, list):
                    for item in body:
                        new_body = item.get(key)
                        get_field()

    # Find the next entry point with a regular expression
    def extract_with_re(self, action, text, url, **kwargs):
        pass

    def url_filter(self, next_url_regex, next_url):
        search = re.search(next_url_regex, next_url, re.I)
        if search:
            return next_url
        else:
            return None
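get_fields walks a JSON body guided by a reduced template whose leaf values name the output keys. A quick demonstration with made-up data:

crawler = Crawler.__new__(Crawler)  # bypass __init__; get_fields uses no state
body = {"data": {"songs": [{"id": 1}, {"id": 2}]}}
field = {"data": {"songs": {"id": "song_id"}}}
results = {}
crawler.get_fields(body, field, results)
print(results)  # {'song_id': [1, 2]}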
            #     link.link = "None"
            #
            #     f = open("log.txt", "a")
            #
            #     f.write(str(link.version)+"\n")
            #     f.write(str(link.downloads_total)+"\n")
            #     f.write(str(link.downloads_last_week)+"\n")
            #     f.write(str(link.info_link)+"\n\n")
            #
            #     f.close()
            #
            #     flag = False

            if flag:

                h, name, path, save = Download.download(
                    crawlers[option - 1], link.link)

                # if h is not False and save is not False:
                #
                #     print("SUCESS")
                #
                #     # TODO -> PROBABLE INFINITE LOOP... MUST CHECK WHY!
                #
                #     data = []
                #     # name, download_link, download_date, origin_website, total_downloads, last_week_downloads, version, hash
                #     data = (name, link.link, time.strftime("%d/%m/%Y - %H:%M:%S"), crawlers[option-1], link.downloads_total,
                #             link.downloads_last_week, link.version, h, path)
                #
                #     DB.insert(data)
                #
                #     number += 1
Example #14
#!/usr/bin/env python3

###################################################################
#
# Copyright (C) 2020 Shubhrendu Tripathi
#
# GPL v3 License
#
###################################################################

import gi
gi.require_version("Gtk", "3.0")
from gi.repository import Gtk
from download import Download
from ui import UI
from si import SI

url_c19s = "https://api.covid19india.org/csv/latest/state_wise.csv"

if __name__ == "__main__":
    """Covid-19 Statistics"""
    UI()
    SI()

    Gtk.main()

    Download.download(url_c19s)  # runs only after the Gtk main loop exits