コード例 #1
0
ファイル: mv_list.py プロジェクト: guaidashu/movie_spider
class MvList(object):
    """Entry point for crawling the movie list and reading it back from the DB."""

    def __init__(self):
        # One DB handle per instance; it is released again in __del__.
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned; a finalizer
        # must not fail with AttributeError in that case.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    @classmethod
    def run(cls):
        """
        Start fetching the movie list by running a ``MvListThread``.

        :return: None
        """
        mv_list_thread = MvListThread()
        mv_list_thread.run()

    def get_mv_list(self):
        """
        Select from the ``list`` table, keeping the connection open.

        :return: whatever ``self.db.select`` returns for the query
        """
        select_arr = {
            "table": "list"
        }
        return self.db.select(select_arr, is_close_db=False)
コード例 #2
0
ファイル: mv_img.py プロジェクト: guaidashu/movie_spider
class MvImg(object):
    """Download the cover image of every ``list`` row whose ``img_status`` is 0."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Run the whole download job."""
        self.handle_data()

    def handle_data(self):
        """
        Download all pending images using a pool of 15 worker threads.

        :return: None
        """
        data = self.get_img_list()
        # The context manager shuts the pool down even when a task raises.
        with ThreadPoolExecutor(max_workers=15) as thread_pool:
            task_list = [
                thread_pool.submit(self.__handle_data, item) for item in data
            ]
            for task in as_completed(task_list):
                # result() re-raises any exception raised inside the worker.
                task.result()

    def __handle_data(self, item):
        """
        Download one image, store it as ``static/images/<id>.jpg`` and mark
        the row as done.

        :param item: row with the keys ``id`` and ``img_src``
        :return: ``{"code": 0}`` on success, ``{"code": 1}`` when nothing
                 writable was downloaded
        """
        url = "https://www.pelisplay.tv" + item['img_src']
        header = {
            "User-Agent": getUserAgent(),
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
        }
        data = curlData(url, header=header)
        if isinstance(data, str):
            # The original code encodes text payloads as UTF-8 before writing.
            data = data.encode("utf-8")
        if not isinstance(data, (bytes, bytearray)):
            # Validate before opening the file so a failed download neither
            # leaves an empty .jpg behind nor gets marked as finished.
            debug("图片下载失败:{url}".format(url=url))
            return {"code": 1}
        with open("static/images/{id}.jpg".format(id=item['id']), "wb") as f:
            f.write(data)
        self.__update_data(item)
        return {"code": 0}

    def __update_data(self, item):
        """
        Set ``img_status`` to 1 for the given row.

        :param item: row with the key ``id``
        :return: result of ``self.db.update``
        """
        update_arr = {
            "table": "list",
            "set": {
                "img_status": 1
            },
            "condition": ['id={id}'.format(id=item['id'])]
        }
        lock.acquire()
        try:
            return self.db.update(update_arr, is_close_db=False)
        finally:
            # Always release, otherwise one failing update blocks every
            # other worker forever.
            lock.release()

    def get_img_list(self):
        """
        Select ``id`` and ``img_src`` of all rows with ``img_status=0``.

        :return: whatever ``self.db.select`` returns for the query
        """
        select_arr = {
            "table": "list",
            "columns": ["id", "img_src"],
            "condition": ['img_status=0']
        }
        return self.db.select(select_arr, is_close_db=False)
コード例 #3
0
class HandleData(object):
    """Import the category menu saved in ``tmp/category_data.txt`` into the ``type`` table."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def handle_category(self):
        """Parse the saved menu page and store its categories."""
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        """
        Read the saved page and return the top-level menu entries.

        :return: direct ``<li>`` children of the first ``ul.sub-menu``
        :raises IndexError: when the page contains no ``ul.sub-menu``
        """
        with open("tmp/category_data.txt", "rb") as f:
            page_resource = f.read().decode("utf-8")
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        # recursive=False: only the direct <li> children, not nested ones.
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        """
        Insert one ``type`` row per menu entry.

        Entries whose link is ``#`` are stored with ``nav_type`` 1 and their
        ``<li>`` descendants are inserted recursively with that row as parent.

        :param category_li: iterable of ``<li>`` tags
        :param handle_type: 2 when inserting children of ``parent_id``
        :param parent_id: id of the parent row, used when ``handle_type`` is 2
        :return: None
        """
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    # Raw string: same pattern, without invalid-escape warnings.
                    insert_arr['keyword'] = re.findall(
                        r'category=([\w\W]*.)', href)[0]
                except IndexError as e:
                    # Links without a category parameter simply get no keyword.
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    last_id = self.db.insertLastId(sql, is_close_db=False)
                    if last_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2,
                                                last_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                # Best effort: a broken entry is logged and skipped.
                debug(e)
コード例 #4
0
class DbTest(object):
    """Small manual check of the SQL generated by ``DBConfig.getInsertSql``."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned; a finalizer
        # must not fail with AttributeError in that case.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def get_columns(self):
        """Build an insert statement for the ``type`` table and log it."""
        data = {"name": "test"}
        sql = self.db.getInsertSql(data, "type")
        debug(sql)
コード例 #5
0
ファイル: recipe_content.py プロジェクト: guaidashu/recipe
class RecipeContent(object):
    """Entry point for crawling recipe detail pages."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned; a finalizer
        # must not fail with AttributeError in that case.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    @classmethod
    def run(cls):
        """Create a ``RecipeContentThread`` and run it."""
        recipe_list_thread = RecipeContentThread()
        recipe_list_thread.run()
コード例 #6
0
ファイル: mv_test.py プロジェクト: guaidashu/movie_spider
class MvTest(object):
    """Placeholder test harness; ``run`` and ``get_category`` are not implemented yet."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned; a finalizer
        # must not fail with AttributeError in that case.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Do nothing (stub)."""
        pass

    def get_category(self):
        """Do nothing (stub)."""
        pass
コード例 #7
0
ファイル: mv_content.py プロジェクト: guaidashu/movie_spider
class MvContent(object):
    """Entry point for crawling movie detail pages."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned; a finalizer
        # must not fail with AttributeError in that case.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    @classmethod
    def run(cls):
        """
        Create a ``MvContentThread`` and run it.

        :return: None
        """
        mv_content_thread = MvContentThread()
        mv_content_thread.run()
コード例 #8
0
class ClearNullData(object):
    """Delete ``content`` rows without video/url together with their ``list`` rows."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Run one clean-up pass."""
        self.handle()

    def handle(self):
        """Fetch one batch of empty rows, log it and delete it from both tables."""
        data = self.get_data()
        debug(data)
        self.del_list(data)
        self.del_content(data)

    def del_list(self, data):
        """
        Delete the ``list`` row referenced by each item.

        :param data: iterable of rows with the key ``parent_id``; ``None`` or
                     an empty result is treated as "nothing to do"
        """
        for item in data or ():
            self.__del_list(item)

    def __del_list(self, item):
        """Delete the ``list`` row whose id equals ``item['parent_id']``."""
        delete_arr = {
            "table": "list",
            "condition": ["id={id}".format(id=item['parent_id'])]
        }
        return self.db.delete(delete_arr, is_close_db=False)

    def del_content(self, data):
        """
        Delete the ``content`` rows referenced by each item.

        :param data: iterable of rows with the key ``parent_id``; ``None`` or
                     an empty result is treated as "nothing to do"
        """
        for item in data or ():
            self.__del_content(item)

    def __del_content(self, item):
        """Delete the ``content`` rows whose ``parent_id`` equals ``item['parent_id']``."""
        delete_arr = {
            "table": "content",
            "condition": ["parent_id={id}".format(id=item['parent_id'])]
        }
        return self.db.delete(delete_arr, is_close_db=False)

    def get_data(self):
        """
        Select ``content`` rows whose ``video_src`` and ``url`` are both empty.

        The query carries ``"limit": [0, 10]``, so each call returns one batch.

        :return: whatever ``self.db.select`` returns for the query
        """
        select_arr = {
            "table": "content",
            "limit": [0, 10],
            "condition": ["video_src=''", "and", "url=''"]
        }
        return self.db.select(select_arr, is_close_db=False)
コード例 #9
0
ファイル: mv_list_data.py プロジェクト: guaidashu/recipe
class MvListData(object):
    """Copy the ``list`` rows referenced by ``tmp_content`` into ``tmp_list``."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Run the copy job."""
        self.handle()

    def handle(self):
        """Copy the ``list`` row of every ``tmp_content`` row."""
        # ``or ()`` keeps a None / empty result set from raising TypeError.
        for item in self.get_data() or ():
            self.__mv_data(item)

    def __mv_data(self, item):
        """
        Copy the ``list`` row with id ``item['list_id']`` into ``tmp_list``.

        :param item: row with the key ``list_id``
        :return: result of ``self.db.insert``, or None when the source row
                 could not be read
        """
        select_arr = {
            "table": "list",
            "condition": ["id={id}".format(id=item['list_id'])]
        }
        try:
            data = self.db.select(select_arr, is_close_db=False)[0]
        except Exception as e:
            # Missing source row (or a failed query): log and skip this item.
            debug(e)
            return
        sql = self.db.getInsertSql(data, "tmp_list")
        debug(sql)
        return self.db.insert(sql, is_close_db=False)

    def get_data(self):
        """
        Select every row of ``tmp_content``.

        :return: whatever ``self.db.select`` returns for the query
        """
        select_arr = {
            "table": "tmp_content",
        }
        return self.db.select(select_arr, is_close_db=False)
コード例 #10
0
class MvListThread(object):
    """Crawl the paginated movie list of every category and store it in ``list``."""

    def __init__(self):
        self.db = DBConfig()
        # Column name / type pairs handed to ``getInsertSql`` for ``list``.
        self.table_columns = (("id", "int"), ("img_src", "varchar"),
                              ("origin_src", "varchar"), ("url", "varchar"),
                              ("description", "text"), ("description_poster",
                                                        "text"),
                              ("star", "varchar"), ("title", "varchar"),
                              ("page_views", "int"), ("label", "text"),
                              ("category_id", "int"), ("director", "varchar"))
        # Number of list pages processed so far (updated under ``lock``).
        self.handle_num = 0

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """
        Crawl all categories with a pool of 10 worker threads.

        :return: None
        """
        category = self.get_category()
        # The context manager shuts the pool down even when a task raises.
        with ThreadPoolExecutor(max_workers=10) as thread_pool:
            task_list = [
                thread_pool.submit(self.handle_data, item) for item in category
            ]
            debug("本次线程数量:{length}".format(length=len(task_list)))
            for task in as_completed(task_list):
                result = task.result()
                if result['code'] == 0:
                    debug("电影类型为 {category} 的数据抓取完毕".format(
                        category=result['category']))
        debug("处理了{length}个线程".format(length=self.handle_num))

    @classmethod
    def get_category(cls):
        """
        :return: the categories reported by ``MvCategory.get_category``
        """
        mv_category = MvCategory()
        return mv_category.get_category()

    @classmethod
    def get_data(cls, category):
        """
        Download one list page.

        :param category: value passed to ``GenerateUrl.generate_url`` as
                         ``domian``; callers in this class pass the page URL
        :return: the page source returned by ``curlData``
        """
        generate_url = GenerateUrl()
        url = generate_url.generate_url(domian=category)
        return curlData(url, open_virtual_ip=True)

    def handle_data(self, category):
        """
        Crawl every page of one category by following the "next" links.

        ``category['url']`` is overwritten with each following page URL.

        :param category: dict with the keys ``url``, ``id`` and ``name``
        :return: ``{"code": 0, "category": <category name>}``
        """
        result = self.handle_data_child(category)
        while result['code'] == 0:
            category['url'] = result['url']
            result = self.handle_data_child(category)
        return {"code": 0, "category": category['name']}

    def handle_data_child(self, category):
        """
        Crawl a single list page and store all movies found on it.

        :param category: dict with the keys ``url`` and ``id``
        :return: ``{"code": 0, "url": next_url}`` when a next page exists,
                 ``{"code": 1, "url": ""}`` otherwise
        """
        page_resource = self.get_data(category['url'])
        bs = BeautifulSoup(page_resource, "html.parser")
        for item in self.__get_li_list(bs):
            self.__handle_data(item, category)
        lock.acquire()
        try:
            self.handle_num = self.handle_num + 1
        finally:
            lock.release()
        next_url = self.__get_next_url(bs)
        code = 1 if next_url == "" else 0
        return {"code": code, "url": next_url}

    def __handle_data(self, item, category):
        """
        Extract all fields of one movie entry and store them.

        :param item: ``<figure>`` tag of the movie
        :param category: dict with the key ``id``
        :return: None
        """
        insert_arr = dict()
        insert_arr['img_src'] = self.__get_img_src(item)
        insert_arr['origin_src'] = self.__get_origin_src(item)
        insert_arr['url'] = self.__get_url(item)
        insert_arr['category_id'] = category['id']
        insert_arr['description'] = self.__get_description(item)
        insert_arr['description_poster'] = self.__get_description_poster(item)
        insert_arr['star'] = self.__get_star(item)
        insert_arr['title'] = self.__get_title(item)
        insert_arr['page_views'] = self.__get_page_views(item)
        insert_arr['director'] = self.__get_director(item)
        insert_arr['label'] = self.__get_label(item)
        result = self.__save_data(insert_arr)
        if result == 0:
            debug("数据存储出错")
        else:
            debug("电影 {name} --> 列表存储成功".format(name=insert_arr['title']))

    def __save_data(self, insert_arr):
        """
        Insert one row into ``list`` while holding the shared lock.

        :param insert_arr: column name -> value
        :return: result of ``self.db.insert``
        """
        lock.acquire()
        try:
            sql = self.db.getInsertSql(insert_arr,
                                       "list",
                                       table_columns=self.table_columns)
            return self.db.insert(sql, is_close_db=False)
        finally:
            # Always release, otherwise one failing insert blocks every
            # other worker forever.
            lock.release()

    @classmethod
    def __get_li_list(cls, bs):
        """
        :param bs: parsed list page
        :return: all ``<figure>`` tags of the page
        """
        return bs.find_all("figure")

    @classmethod
    def __get_img_src(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: ``src`` of the first ``<img>``, or "" on failure
        """
        try:
            img_src = item.find("img").attrs['src']
        except Exception as e:
            img_src = ""
            debug("电影封面图获取出错,出错信息:{error}".format(error=e))
        return img_src

    @classmethod
    def __get_origin_src(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: ``src`` of the second ``<img>``, or "" on failure
        """
        try:
            origin_src = item.find_all("img")[1].attrs['src']
        except Exception as e:
            origin_src = ""
            debug("电影小图标获取出错,出错信息:{error}".format(error=e))
        return origin_src

    @classmethod
    def __get_url(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: ``href`` of the first ``<a>``, or "" on failure
        """
        try:
            url = item.find("a").attrs['href']
        except Exception as e:
            url = ""
            debug("电影详情链接地址获取出错,出错信息:{error}".format(error=e))
        return url

    @classmethod
    def __get_description(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: text of the first ``<div>`` inside ``div.Description``,
                 or "" on failure
        """
        try:
            description = item.find("div", attrs={"class": "Description"})
            description = description.find("div").get_text().strip()
        except Exception as e:
            description = ""
            debug("电影描述获取出错,出错信息:{error}".format(error=e))
        return description

    @classmethod
    def __get_description_poster(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: text of ``p.description_poster``, or "" on failure
        """
        try:
            description_poster = item.find(
                "p", attrs={"class": "description_poster"})
            description_poster = description_poster.get_text().strip()
        except Exception as e:
            description_poster = ""
            debug("电影短述获取出错,出错信息:{error}".format(error=e))
        return description_poster

    @classmethod
    def __get_star(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: text of ``span.qualification``, or "" on failure
        """
        try:
            star = item.find("span", attrs={"class": "qualification"})
            star = star.get_text().strip()
        except Exception as e:
            star = ""
            # The original logged the "短述" (short description) message here,
            # copy-pasted from __get_description_poster.
            debug("电影评分获取出错,出错信息:{error}".format(error=e))
        return star

    @classmethod
    def __get_title(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: text of ``div.Title``, or "" on failure
        """
        try:
            title = item.find("div", attrs={"class": "Title"})
            title = title.get_text().strip()
        except Exception as e:
            title = ""
            debug("电影标题获取出错,出错信息:{error}".format(error=e))
        return title

    @classmethod
    def __get_page_views(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: text of the first ``div.otros`` with "," and " visitas"
                 removed (a string), or the integer 0 on failure
        """
        try:
            page_views = item.find("div", attrs={"class": "otros"})
            page_views = page_views.get_text().strip()
            page_views = page_views.replace(",", "")
            page_views = page_views.replace(" visitas", "")
        except Exception as e:
            page_views = 0
            debug("电影浏览量获取出错,出错信息:{error}".format(error=e))
        return page_views

    @classmethod
    def __get_director(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: text of the second ``div.otros`` without the "Director: "
                 prefix, or "" on failure
        """
        try:
            director = item.find_all("div", attrs={"class": "otros"})[1]
            director = director.get_text().strip()
            director = director.replace("Director: ", "")
        except Exception as e:
            director = ""
            debug("电影导演获取出错,出错信息:{error}".format(error=e))
        return director

    @classmethod
    def __get_label(cls, item):
        """
        :param item: ``<figure>`` tag of the movie
        :return: comma-joined text of all ``<span>`` tags inside
                 ``div.tipcategorias``, or "" on failure
        """
        try:
            # The lookup is inside the try: a missing container must be logged
            # like every other field instead of aborting the whole crawl.
            container = item.find("div", attrs={"class": "tipcategorias"})
            spans = container.find_all("span")
            label = ",".join(span.get_text().strip() for span in spans)
        except Exception as e:
            label = ""
            debug("电影标签获取出错,出错信息:{error}".format(error=e))
        return label

    @classmethod
    def __get_next_url(cls, bs):
        """
        :param bs: parsed list page
        :return: ``href`` of the ``rel="next"`` link inside the first
                 ``ul.pager``, or "" when there is none
        """
        try:
            pager = bs.find_all("ul", attrs={"class": "pager"})[0]
            next_url = pager.find("a", attrs={"rel": "next"}).attrs['href']
        except Exception as e:
            next_url = ""
            debug("下一页url获取出错,出错信息:{error}".format(error=e))
        debug(next_url)
        return next_url
コード例 #11
0
class ConstitutionThread(threading.Thread):
    """Worker thread that downloads one law page and stores it in ``constitutions``."""

    def __init__(self, flfgID, zlsxid, showDetailType, province, index):
        """
        :param flfgID: ``flfgID`` query parameter of the detail page
        :param zlsxid: ``zlsxid`` query parameter of the detail page
        :param showDetailType: ``showDetailType`` query parameter
        :param province: value stored in the ``province`` column
        :param index: sequence number of this worker, used in log output
        """
        super().__init__()
        self.count = 0
        self.flfgID = flfgID
        self.zlsxid = zlsxid
        self.showDetailType = showDetailType
        self.province = province
        self.index = index
        self.rv = 0
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Fetch and store the page, then print whether it succeeded.

        On success the module-level counter ``rv`` is incremented under
        ``mylock``.
        """
        global rv
        debug("线程" + str(self.index) + "开始", True)
        try:
            result = self.getConstitutionData(self.flfgID, self.zlsxid,
                                              self.showDetailType,
                                              self.province)
            if result != 1:
                tmp = "第" + str(self.index) + "条获取失败"
            else:
                tmp = "第" + str(self.index) + "条获取成功"
                mylock.acquire()
                try:
                    rv = rv + 1
                finally:
                    mylock.release()
        except Exception as e:
            # Any failure counts as "not fetched"; keep the reason in the log.
            debug(e)
            tmp = "第" + str(self.index) + "条获取失败"
        print(tmp)

    def getConstitutionData(self, flfgID, zlsxid, showDetailType, province):
        """
        Download one detail page, extract its fields and insert them.

        The page URL takes the GET parameters ``flfgID``, ``showDetailType``,
        ``zlsxid`` and ``keyword``. Missing parts of the page are recorded
        with ``is_get_error = 1`` instead of aborting.

        :param flfgID: ``flfgID`` query parameter
        :param zlsxid: ``zlsxid`` query parameter
        :param showDetailType: ``showDetailType`` query parameter
        :param province: value stored in the ``province`` column
        :return: result of ``self.db.insert``
        """
        query = urlencode({
            'flfgID': flfgID,
            'showDetailType': showDetailType,
            'zlsxid': zlsxid,
            'keyword': "",
        })
        url = "http://210.82.32.100:8081/FLFG/flfgByID.action" + "?" + query
        page = BeautifulSoup(self._download(url, query), "html.parser")

        type_data, flag = self._parse_header(page.find_all("table"))
        type_data['url'] = url
        try:
            type_data['title'] = page.find_all(
                "div", attrs={"class": "bt"})[0].getText().strip()
        except (IndexError, AttributeError):
            type_data['title'] = "标题获取出错"
            flag = True
        try:
            type_data['content'] = str(
                page.find_all("div", attrs={"id": "content"})[0])
        except IndexError:
            flag = True
        type_data['province'] = province
        type_data['is_get_error'] = 1 if flag else 0
        return self._store(type_data)

    @staticmethod
    def _download(url, query):
        """Download ``url``, retrying until ``curlData`` succeeds; return text when decodable."""
        # NOTE(review): the retry is unbounded, as in the original code; a
        # permanently failing URL keeps this worker busy forever.
        while True:
            try:
                data = curlData(url, query, url)
                break
            except Exception as e:
                debug(e)
        if isinstance(data, (bytes, bytearray)):
            try:
                data = data.decode("utf-8")
            except UnicodeDecodeError as e:
                # Keep the raw bytes; BeautifulSoup is given them as-is.
                debug(e)
        return data

    @staticmethod
    def _parse_header(tables):
        """Map the label/value cells of the first table to column values.

        Returns ``(columns, failed)`` where ``failed`` is True when the page
        has no table at all.
        """
        label_to_column = {
            '资料属性:': 'type',
            '部门分类:': 'department_type',
            '制定机关:': 'office',
            '颁布文号:': 'reference_num',
            '颁布日期:': 'issue_date',
            '施行日期:': 'execute_date',
            '时 效 性:': 'timeliness',
        }
        columns = dict()
        try:
            cells = tables[0].find_all("td")
        except (IndexError, AttributeError):
            # No table: nothing to iterate (the original iterated over the
            # characters of an error string and crashed).
            return columns, True
        # Cells alternate label, value, label, value, ...
        for k in range(0, len(cells), 2):
            label = cells[k].getText().strip()
            column = label_to_column.get(label)
            if column is None:
                # Unknown label: there is no column to fill, so skip it
                # instead of failing the whole record.
                debug("未知的表头字段:{label}".format(label=label))
                continue
            try:
                columns[column] = cells[k + 1].getText().strip()
            except IndexError:
                columns[column] = "数据获取出错"
        return columns, False

    def _store(self, type_data):
        """Insert ``type_data`` into ``constitutions``, retrying until it works."""
        while True:
            mylock.acquire()
            try:
                sql = self.db.getInsertSql(type_data, "constitutions")
                result = self.db.insert(sql, is_close_db=False)
                break
            except Exception as e:
                debug(e)
            finally:
                # Release on every path: the original kept the lock after a
                # failed insert, so the retry dead-locked on acquire().
                mylock.release()
        return result

    def getRv(self):
        """Return the module-level success counter ``rv``."""
        global rv
        return rv

    def reset(self):
        """Reset the module-level success counter ``rv`` to 0."""
        global rv
        rv = 0
コード例 #12
0
class GetImages(object):
    """Download the images of all ``list`` rows whose ``status`` is 0."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Run the download job."""
        self.handle()

    def handle(self):
        """Download the large image of every pending row."""
        data = self.get_data()
        self.get_images(data, "large", img_url="img_url_large")

    @classmethod
    def start_thread(cls, data, fun, path, img_url, prefix):
        """
        Call ``fun(item, path, img_url, prefix)`` for every item on a pool of
        15 worker threads.

        :param data: iterable of items
        :param fun: callable executed once per item
        :param path: passed through to ``fun``
        :param img_url: passed through to ``fun``
        :param prefix: passed through to ``fun``
        :return: list of the return values, in completion order
        """
        # The context manager shuts the pool down even when a task raises.
        with ThreadPoolExecutor(max_workers=15) as thread_pool:
            task_list = [
                thread_pool.submit(fun, item, path, img_url, prefix)
                for item in data
            ]
            return [task.result() for task in as_completed(task_list)]

    def get_images(self, data, path, img_url, prefix=""):
        """
        Download one image per row into ``static/images/<path>/``.

        :param data: iterable of rows
        :param path: sub-directory below ``static/images``
        :param img_url: key of the row holding the image URL
        :param prefix: string prepended to every image URL
        """
        self.start_thread(data, self.__get_images, path, img_url, prefix)

    def __get_images(self, item, path, img_url, prefix):
        """
        Download one image, write it to disk and set the row's ``status`` to 1.

        :param item: row with the key ``id`` and the key named by ``img_url``
        :param path: sub-directory below ``static/images``
        :param img_url: key of ``item`` holding the image URL
        :param prefix: string prepended to the image URL
        """
        url = prefix + item[img_url]
        page_resource = self.get_page_resource(url)
        if isinstance(page_resource, str):
            # The original code encodes text payloads as UTF-8 before writing.
            page_resource = page_resource.encode("utf-8")
        if not isinstance(page_resource, (bytes, bytearray)):
            # Validate before opening the file so a failed download neither
            # leaves an empty .jpg behind nor gets marked as finished.
            debug("图片下载失败:{url}".format(url=url))
            return
        with open(
                "static/images/{path}/{id}.jpg".format(path=path,
                                                       id=item['id']),
                "wb") as f:
            f.write(page_resource)
        update_data = {"status": 1}
        condition = ["id={id}".format(id=item['id'])]
        self.__update_data(update_data, "list", condition)

    @classmethod
    def get_page_resource(cls, url):
        """
        :param url: address to download
        :return: whatever ``curlData`` returns for the URL
        """
        return curlData(url, open_virtual_ip=True)

    def __update_data(self, update_data, table, condition):
        """
        Run one update while holding the shared lock.

        :param update_data: column name -> new value
        :param table: table name
        :param condition: list of condition strings
        """
        update_arr = {
            "table": table,
            "set": update_data,
            "condition": condition
        }
        lock.acquire()
        try:
            self.db.update(update_arr, is_close_db=False)
        finally:
            # Always release, otherwise one failing update blocks every
            # other worker forever.
            lock.release()

    def get_data(self):
        """
        Select ``id``, ``img_url`` and ``img_url_large`` of rows with ``status=0``.

        :return: whatever ``self.db.select`` returns for the query
        """
        return self.db.select(
            {
                "table": "list",
                "columns": ["id", "img_url", "img_url_large"],
                "condition": ["status=0"]
            },
            is_close_db=False)
コード例 #13
0
class RecipeContentThread(object):
    """Crawl the detail page of every recipe and store it in ``content``."""

    def __init__(self):
        self.recipe_list = RecipeListSpider()
        # Column name / type pairs handed to ``getInsertSql`` for ``content``.
        self.table_columns = (("id", "int"), ("img_url", "varchar"),
                              ("video_id", "varchar"), ("preparation",
                                                        "longtext"),
                              ("ingredients", "text"), ("name", "varchar"),
                              ("list_id", "int"))
        # Number of recipes stored successfully (updated under ``lock``).
        self.handle_num = 0
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """
        Fetch the recipe list and crawl every entry.

        :return: None
        """
        data = self.get_list()
        self.start(data)

    def start(self, data):
        """
        Process all recipes on a pool of 15 worker threads.

        :param data: iterable of recipe list rows
        :return: None
        """
        # The context manager shuts the pool down even when a task raises.
        with ThreadPoolExecutor(max_workers=15) as thread_pool:
            task_list = [
                thread_pool.submit(self.handle_data, item) for item in data
            ]
            for task in as_completed(task_list):
                # result() re-raises any exception raised inside the worker.
                task.result()
        debug("本次处理成功 {num} 个线程".format(num=self.handle_num))

    def get_list(self):
        """
        :return: the recipe list reported by ``RecipeListSpider.get_list``
        """
        return self.recipe_list.get_list()

    def handle_data(self, item):
        """
        Download, parse and store one recipe.

        :param item: recipe list row with the keys ``id``, ``name`` and ``url``
        :return: ``{"code": 0}``
        """
        page_resource = self.get_data(item)
        result = self.__handle_data(page_resource, item)
        if result['code'] == 0:
            debug("菜谱存储出错 --> {name}".format(name=item['name']))
        else:
            debug("菜谱存储成功 --> {name}".format(name=item['name']))
            # ``result['code']`` is the id of the new ``content`` row; the
            # status flag lives on the ``list`` row, whose id is item['id'].
            self.__update_status(item['id'])
            lock.acquire()
            try:
                self.handle_num = self.handle_num + 1
            finally:
                lock.release()
        return {"code": 0}

    @classmethod
    def get_data(cls, item):
        """
        :param item: recipe list row with the key ``url``
        :return: the page source returned by ``curlData``
        """
        url = CommonFunc().generate_content_url(item['url'])
        return curlData(url, open_virtual_ip=True)

    def __handle_data(self, page_resource, item):
        """
        Extract the recipe fields from the page and insert them.

        :param page_resource: page source of the recipe
        :param item: recipe list row with the keys ``id`` and ``name``
        :return: ``{'code': <result of insertLastId>}``
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        insert_arr = dict()
        insert_arr['video_id'] = self.__get_video_id(bs)
        # The image URL is searched in the raw page text, not the parsed tree.
        insert_arr['img_url'] = self.__get_img_url(page_resource)
        insert_arr['name'] = item['name']
        insert_arr['preparation'] = self.__get_preparation(bs)
        insert_arr['ingredients'] = self.__get_ingredients(bs)
        insert_arr['list_id'] = item['id']
        return {'code': self.__save_data(insert_arr)}

    def __save_data(self, insert_arr):
        """
        Insert one row into ``content`` while holding the shared lock.

        :param insert_arr: column name -> value
        :return: result of ``self.db.insertLastId``
        """
        lock.acquire()
        try:
            sql = self.db.getInsertSql(insert_arr,
                                       table="content",
                                       table_columns=self.table_columns)
            return self.db.insertLastId(sql, is_close_db=False)
        finally:
            # Always release, otherwise one failing insert blocks every
            # other worker forever.
            lock.release()

    def __update_status(self, recipe_list_id):
        """
        Set ``status`` to "1" on the ``list`` row with the given id.

        :param recipe_list_id: id of the ``list`` row
        """
        update_arr = {
            "table":
            "list",
            "set": {
                "status": "1"
            },
            "condition":
            ["id={recipe_list_id}".format(recipe_list_id=recipe_list_id)]
        }
        lock.acquire()
        try:
            self.db.update(update_arr, is_close_db=False)
        finally:
            lock.release()

    @classmethod
    def __get_video_id(cls, bs):
        """
        :param bs: parsed recipe page
        :return: the YouTube id taken from the first ``<iframe>`` source,
                 or "" on failure
        """
        try:
            src = bs.find("iframe").attrs['src']
            video_id = re.findall(
                r'https://www.youtube.com/embed/([\w\W]*?)\?', src)[0]
        except Exception as e:
            video_id = ""
            debug("视频播放id获取出错,错误信息:{error}".format(error=e))
        return video_id

    @classmethod
    def __get_img_url(cls, bs):
        """
        :param bs: page source (anything whose ``str()`` is the page text)
        :return: value of the first ``"image": "..."`` entry, or "" on failure
        """
        try:
            img_url = re.findall(r'"image": "([\w\W]*?)"', str(bs))[0]
        except Exception as e:
            img_url = ""
            debug("菜谱图片链接获取出错,错误信息:{error}".format(error=e))
        return img_url

    @classmethod
    def __get_preparation(cls, bs):
        """
        :param bs: parsed recipe page
        :return: inner HTML of the ``<ul>`` inside
                 ``div.cs-recipe-single-preparation``, or "" on failure
        """
        try:
            box = bs.find("div",
                          attrs={"class": "cs-recipe-single-preparation"})
            preparation = re.findall(r"<ul>([\w\W]*?)<\/ul>",
                                     str(box.find("ul")))[0]
        except Exception as e:
            preparation = ""
            debug("菜谱做法获取出错,错误信息:{error}".format(error=e))
        return preparation

    @classmethod
    def __get_ingredients(cls, bs):
        """
        :param bs: parsed recipe page
        :return: comma-joined text of the ``<li>`` tags inside
                 ``div.cs-ingredients-check-list``, or "" on failure
        """
        try:
            box = bs.find("div", attrs={"class": "cs-ingredients-check-list"})
            entries = box.find("ul").find_all("li")
            ingredients_str = ",".join(
                entry.get_text().strip() for entry in entries)
        except Exception as e:
            ingredients_str = ""
            debug("配料获取出错,错误信息:{error}".format(error=e))
        return ingredients_str
コード例 #14
0
class GetVideoSrc(object):
    """Resolve the video source of every ``content`` row whose ``status`` is 0."""

    def __init__(self):
        # Cookies sent with each request; filled by get_cookie() when called.
        self.cookie = {}
        self.db = DBConfig()

    def __del__(self):
        # __init__ may have raised before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Run the whole job."""
        data = self.get_content_list()
        self.handle_data(data)

    def handle_data(self, data):
        """
        Process every row that has a URL on a pool of 15 worker threads.

        :param data: iterable of rows with the keys ``id`` and ``url``
        :return: None
        """
        # The context manager shuts the pool down even when a task raises.
        with ThreadPoolExecutor(max_workers=15) as thread_pool:
            task_list = [
                thread_pool.submit(self.__handle_data, item)
                for item in data
                if item['url'] != ''
            ]
            for task in as_completed(task_list):
                # result() re-raises any exception raised inside the worker.
                task.result()

    def __handle_data(self, item):
        """
        Resolve the video source of one row and store it with ``status`` 1.

        :param item: row with the keys ``id`` and ``url``
        :return: ``{"code": 0}``
        """
        update_data = dict()
        update_data['status'] = 1
        update_data['video_src'] = self.__get_video_src(item)
        debug(update_data['video_src'])
        self.__update_data(item['id'], update_data)
        return {"code": 0}

    def __get_video_src(self, item):
        """
        Download the player page and read the file URL from its
        ``JSON.parse('...')`` payload.

        :param item: row with the key ``url``
        :return: ``file`` of the first payload entry, or "" on failure
        """
        header = {
            "User-Agent": getUserAgent(),
        }
        data = curlData(url=item['url'], header=header, cookie=self.cookie)
        try:
            # Raw string: same pattern, without invalid-escape warnings.
            src = re.findall(r"JSON\.parse\('([\w\W]*?)'\)\);", data)[0]
            # Backslashes are stripped before the payload is parsed as JSON.
            src = src.replace("\\", "")
            src = json.loads(src)[0]['file']
        except Exception as e:
            src = ""
            debug(e)
        return src

    def __update_data(self, content_id, update_data):
        """
        Update one ``content`` row while holding the shared lock.

        :param content_id: id of the row
        :param update_data: column name -> new value
        :return: result of ``self.db.update``
        """
        update_arr = {
            "table": "content",
            "set": update_data,
            "condition": ['id={content_id}'.format(content_id=content_id)]
        }
        lock.acquire()
        try:
            return self.db.update(update_arr, is_close_db=False)
        finally:
            # Always release, otherwise one failing update blocks every
            # other worker forever.
            lock.release()

    def get_content_list(self):
        """
        Select ``id`` and ``url`` of all ``content`` rows with ``status=0``.

        :return: whatever ``self.db.select`` returns for the query
        """
        return self.db.select({
            "table": "content",
            "columns": ['id', 'url'],
            "condition": ['status=0']
        }, is_close_db=False)

    def get_cookie(self):
        """Fetch cookies from the site's start page into ``self.cookie`` and log them."""
        header = {
            "User-Agent": getUserAgent(),
        }
        url = "https://www.pelisplay.tv/"
        self.cookie = getCookie(url, header=header)
        debug(self.cookie)
コード例 #15
0
class RecipeGetPage(object):
    """
    Reads the pagination numbers shown on each recipe category page and
    stores them, JSON encoded, in the ``page_num`` column of the ``type`` table.
    """

    def __init__(self):
        self.db = DBConfig()
        self.category = RecipeType().get()

    def __del__(self):
        self.db.closeDB()

    def get(self):
        """
        Entry point: process every category held in ``self.category``.

        :return:
        """
        self.__get_recipe_page()

    def __get_recipe_page(self):
        """
        Build the URL of each category and collect its page numbers.
        A failure while handling one category is logged and the loop goes on.

        :return:
        """
        for category in self.category:
            category_url = CommonFunc().generate_url(category=category['keyword'])
            try:
                self.__get_recipe_page_data(category_url, category['id'])
            except Exception as e:
                debug("页面数量抓取出错,出错信息:{error}".format(error=e))

    def __get_recipe_page_data(self, url, recipe_category_id):
        """
        Download one category page, extract its page numbers and save them.

        :param url: category page URL
        :param recipe_category_id: ``id`` of the ``type`` row to update
        :return:
        """
        html = curlData(url, open_virtual_ip=True)
        soup = BeautifulSoup(html, "html.parser")
        pager = soup.find_all("ul", attrs={"class": "page-numbers"})[0]
        # Drop the "next" links so only numbered links remain.
        for next_link in pager.find_all('a', attrs={"class": "next"}):
            next_link.extract()
        links = pager.find_all("a")
        span = pager.find("span")
        numbers = [str(link.get_text()).strip() for link in links]
        # Link texts first, then the text of the first <span>, comma separated.
        joined = ",".join(numbers) + "," + span.get_text().strip()
        # Persist to MySQL.
        update_arr = {
            "table": "type",
            "set": {
                "page_num": json.dumps({"page_list": joined})
            },
            "condition": ['id={id}'.format(id=recipe_category_id)]
        }
        result = self.db.update(update_arr, is_close_db=False)
        template = ("id为{id}的菜谱类型页面数据抓取成功" if result == 1
                    else "id为{id}的菜谱类型页面数据抓取失败")
        debug(template.format(id=recipe_category_id))
コード例 #16
0
class GetConstitutionList(object):
    """
    Crawler for the law database served at ``http://210.82.32.100:8081/FLFG/``.

    It offers a POST based list crawl (``getAllConstitutionStart``), a browser
    driven crawl (``getAllConstitution``) and a detail page fetcher
    (``getConstitutionData``). ``self.count`` accumulates the value reported by
    the worker threads started in ``getAllConstitutionHandle``.
    """

    def __init__(self):
        # Running total reported when the object is destroyed.
        self.count = 0
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()
        debug("本次一共获取到了%s条数据" % str(self.count))

    def getAllConstitutionStart(self):
        """
        Crawl the paged result list, starting from the page stored in the
        ``constitutions_record`` table (page 1 when no record can be read).

        After each page the next page number is written back to the record
        with ``id=2``; when the crawl ends the record is marked ``is_over=1``.

        :return:
        """
        try:
            record = self.db.select({"table": "constitutions_record", "condition": ['is_over=0']}, is_close_db=False)
            next_page = record[0]['page']
        except Exception:
            next_page = 1
        while True:
            try:
                data = self.getConstitutionList(next_page)
            except Exception:
                # Retry the same page; ``Exception`` (not a bare except) keeps
                # Ctrl-C able to interrupt the retry loop.
                debug("内容获取出错,重新获取")
                continue
            # Read the number of the following page from the "next page" link.
            try:
                tmpNextPage = re.findall(r"""href="javascript:toUpDownPage\('(\d+)'\);">下一页<\/a>""", data)[0]
                debug("当前的页码是:%s" % str(next_page))
                debug("获取到的下一页页码是:%s" % str(tmpNextPage))
            except Exception:
                debug("下一页的页码获取出错")
                break
            self.getAllConstitutionHandle(data, "北京")
            progress = {
                "table": "constitutions_record",
                "condition": ['id=2'],
                "set": {
                    "page": tmpNextPage,
                    "is_over": 0
                }
            }
            self.db.update(progress, is_close_db=False)
            if int(next_page) >= int(tmpNextPage):
                break
            next_page = tmpNextPage
        debug("本次抓取完毕")
        finished = {
            "table": "constitutions_record",
            "condition": ['id=2'],
            "set": {
                "is_over": 1
            }
        }
        self.db.update(finished, is_close_db=False)

    def getAllConstitution(self, fun):
        """
        Crawl all laws and regulations with a headless Firefox browser.

        :param fun: callback invoked as ``fun(page_source, province)`` for every
                    result page
        :return:
        """
        url = "http://210.82.32.100:8081/FLFG/"
        dcap = dict(DesiredCapabilities.FIREFOX)
        ip = virtualIp()
        dcap['phantomjs.page.customHeaders.X-FORWARDED-FOR'] = ip
        dcap['phantomjs.page.customHeaders.CLIENT-IP'] = ip
        firefox_options = Options()
        firefox_options.add_argument("--headless")
        firefox_options.add_argument('--disable-gpu')
        driver = webdriver.Firefox(firefox_options=firefox_options, desired_capabilities=dcap)
        try:
            self._crawl_all_provinces(driver, url, fun)
        finally:
            # Always shut the browser down, even when the crawl raises.
            driver.quit()

    def _crawl_all_provinces(self, driver, url, fun):
        """Open the start page and visit every province link of the fourth "cloumn" element."""
        driver.get(url)
        sleep(3)
        # Wait until the fourth "cloumn" element exists. The elements are
        # queried again on every attempt, otherwise the wait could never end.
        while True:
            columns = driver.find_elements_by_class_name("cloumn")
            if len(columns) > 3:
                cloumn = columns[3]
                break
            sleep(1)
        cloumntitle = cloumn.find_elements_by_class_name("threecloumntitle")
        current_handle = driver.current_window_handle
        for title in cloumntitle:
            try:
                list_a = title.find_elements_by_tag_name("a")
            except Exception:
                list_a = list()
                debug("省份列表获取出错")
            for k, link in enumerate(list_a):
                # Province name; fall back to the link index when unreadable.
                try:
                    province = link.text
                except Exception:
                    debug("省份获取出错,继续执行,省份标记锚点为" + str(k))
                    province = str(k)
                debug(province + ":")
                try:
                    link.click()
                except Exception:
                    debug("点击失败")
                sleep(3)
                all_handles = driver.window_handles
                sleep(3)
                for handle in all_handles:
                    if handle == current_handle:
                        continue
                    driver.switch_to_window(handle)
                    sleep(1)
                    try:
                        self._crawl_province_window(driver, fun, province)
                    except Exception as e:
                        debug(e)
                    debug("")
                    driver.close()
                    sleep(1)
                    driver.switch_to_window(all_handles[0])
                    sleep(2)

    def _crawl_province_window(self, driver, fun, province):
        """Feed every result page of the currently focused window to ``fun``."""
        htmlData = BeautifulSoup(driver.page_source, "html.parser")
        url = htmlData.find_all("iframe", attrs={"id": "rightpage"})[0].attrs['src']
        url = re.sub("(有效)", "有效,已被修正,失效", url)
        driver.execute_script("location.href='" + url + "'")
        sleep(3)
        # Best effort: switch to 50 entries per page when the control exists.
        try:
            driver.find_element_by_id("span_pagesize_50").click()
            sleep(3)
        except Exception:
            pass
        data = driver.page_source
        try:
            nextPage = self._parse_next_page(data)
        except Exception:
            nextPage = 0
        while True:
            tmpPage = int(nextPage) - 1
            debug("第" + str(tmpPage) + "页:")
            fun(data, province)
            nextPageElement = driver.find_element_by_class_name("td")
            try:
                # Click the second link of the pager and read the new page.
                nextPageElement = nextPageElement.find_elements_by_tag_name("a")[1]
                nextPageElement.click()
                sleep(3)
                data = driver.page_source
                tmpNextPage = self._parse_next_page(data)
            except Exception:
                break
            if nextPage == tmpNextPage:
                # The pager did not advance: last page reached.
                break
            nextPage = tmpNextPage

    @staticmethod
    def _parse_next_page(page_source):
        """Return, as a string, the page number found in the "下一页" link; raises when it is absent."""
        hrefs = re.findall(r'<a[\w\W]*?href="([\w\W]*?)">下一页', page_source)
        hrefs = re.findall(r'<a[\w\W]*?href="([\w\W]*?)\)', hrefs[0])
        return re.findall(r"(\d+)", hrefs[1])[0]

    def getConstitutionList(self, cur_page):
        """
        POST the search form and return one page of the result list.

        :param cur_page: page number sent as ``curPage``
        :return: the response returned by ``curlData``
        """
        url = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
        referer = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
        post = {
            "pagesize": "20",
            "pageCount": "500",
            "curPage": cur_page,
            "resultSearch": "false",
            "lastStrWhere": "  SFYX:(有效~已被修正~失效) ^(ZLSX:1111 ~ZLSX=01)  ^ BMFL:(03)  ^ SFFB=Y ",
            "bt": "",
            "flfgnr": "",
            "sxx": "有效,已被修正,失效",
            "zlsxid": "12",
            "bmflid": "",
            "xldj": "",
            "bbrqbegin": "2018-09-01",
            "bbrqend": "2018-12-17",
            "sxrqbegin": "",
            "sxrqend": "",
            "zdjg": "",
            "bbwh": ""
        }
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }
        data = curlData(url=url, value=post, referer=referer, header=header)
        return data

    def getAllConstitutionHandle(self, data, province):
        """
        Start one ``ConstitutionThread`` per distinct ``showLocation(...)`` link
        found in ``data``, wait for all of them and add the first thread's
        ``getRv()`` value to ``self.count``.

        :param data: HTML of a result list page
        :param province: province label forwarded to every thread
        :return: always 1
        """
        entries = re.findall(r'<a[\w\W]*?href="javascript:showLocation([\w\W]*?);"', data)
        old1 = ""
        old5 = ""
        i = 0
        thread_list = list()
        for raw in entries:
            parts = tuple(raw.split("'"))
            try:
                # Consecutive links pointing at the same entry are handled once.
                if parts[1] == old1 and parts[7] == old5:
                    continue
                i = i + 1
                thread_list.append(ConstitutionThread(parts[1], parts[7], parts[3], province, i))
                old1 = parts[1]
                old5 = parts[7]
            except Exception:
                # Malformed link (too few quoted arguments): skip it.
                continue
        if not thread_list:
            # Nothing parsed from this page; indexing thread_list[0] below
            # would raise IndexError.
            return 1
        for worker in thread_list:
            worker.start()
        for worker in thread_list:
            worker.join()
        handled = thread_list[0].getRv()
        # Reset the counter kept by the thread class.
        thread_list[0].reset()
        self.count = self.count + handled
        return 1

    def getConstitutionData(self, flfgID, zlsxid, showDetailType, province):
        """
        Download one detail page, extract its header table, title and content
        and insert the record into the ``constitutions`` table.

        The detail URL takes the GET parameters ``flfgID``, ``zlsxid`` and
        ``keyword``; per the original author's note the first two are required
        and come from the JS data of the list page.

        :param flfgID: record id taken from the list page
        :param zlsxid: attribute id taken from the list page
        :param showDetailType: detail type taken from the list page
        :param province: province label stored with the record
        :return: the result of ``self.db.insert``
        """
        flag = False
        url = "http://210.82.32.100:8081/FLFG/flfgByID.action"
        get = dict()
        get['flfgID'] = flfgID
        get['showDetailType'] = showDetailType
        get['zlsxid'] = zlsxid
        get['keyword'] = ""
        get = urlencode(get)
        url = url + "?" + get
        # Retry until the page could be downloaded.
        while True:
            try:
                data = curlData(url, get, url)
                break
            except Exception as e:
                debug(e)
        try:
            data = data.decode("utf-8")
        except (AttributeError, UnicodeDecodeError):
            # Already text, or not valid UTF-8: use the response as it is.
            pass
        handleDataAll = BeautifulSoup(data, "html.parser")
        handleData = handleDataAll.find_all("table")
        columns_list = ['type', "department_type", 'office', 'reference_num', 'issue_date', 'execute_date',
                        'timeliness']
        columns_name_list = ['资料属性:', '部门分类:', '制定机关:', '颁布文号:', '颁布日期:', '施行日期:', '时 效 性:']
        column_by_label = dict(zip(columns_name_list, columns_list))
        # Header table with the basic information.
        try:
            table_data = handleData[0].find_all("td")
        except Exception:
            # No table found: mark the record and skip the cell loop.
            table_data = []
            flag = True
        type_data = dict()
        type_data['url'] = url
        # Cells alternate label / value, so every even index holds a label.
        for k in range(0, len(table_data), 2):
            label = table_data[k].getText().strip()
            if label not in column_by_label:
                # Unknown label: nothing to store for it.
                continue
            try:
                type_data[column_by_label[label]] = table_data[k + 1].getText().strip()
            except IndexError:
                # The label has no value cell after it.
                type_data[column_by_label[label]] = "数据获取出错"
        # Title.
        try:
            type_data['title'] = handleDataAll.find_all("div", attrs={"class": "bt"})[0].getText().strip()
        except Exception:
            type_data['title'] = "标题获取出错"
            flag = True
        # Content.
        try:
            type_data['content'] = str(handleDataAll.find_all("div", attrs={"id": "content"})[0])
        except Exception:
            flag = True
        type_data['province'] = province
        type_data['is_get_error'] = 1 if flag else 0
        # Retry until the insert goes through.
        while True:
            try:
                sql = self.db.getInsertSql(type_data, "constitutions")
                result = self.db.insert(sql, is_close_db=False)
                break
            except Exception as e:
                debug(e)
        return result
コード例 #17
0
class MvContentThread(object):
    """
    Fetches the detail page of every movie returned by ``MvList`` in a thread
    pool, resolves its play link and stores it in the ``content`` table.
    """

    def __init__(self):
        self.table_columns = (("id", "int"), ("parent_id", "int"), ("url",
                                                                    "text"))
        self.cookie = dict()
        # Number of cookie refreshes since the last successful link request.
        self.cookie_get_num = 0
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        """
        Handle every movie of the list with up to 15 worker threads.

        :return:
        """
        data = self.get_list()
        self.get_cookie()
        # The context manager shuts the pool down once all tasks are done.
        with ThreadPoolExecutor(max_workers=15) as thread_pool:
            task_list = [
                thread_pool.submit(self.handle_data, item) for item in data
            ]
            for task in as_completed(task_list):
                # result() re-raises anything the worker raised.
                task.result()

    @classmethod
    def get_list(cls):
        """
        :return: the movie list provided by ``MvList.get_mv_list``
        """
        mv_list = MvList()
        return mv_list.get_mv_list()

    def handle_data(self, item):
        """
        Download, parse and store one movie.

        :param item: row of the movie list; ``url``, ``id`` and ``title`` are read
        :return: ``{"code": 0}``
        """
        page_resource = self.get_data(item)
        result = self.__handle_data(page_resource, item)
        # __handle_data returns a dict, so the insert result is under "code".
        if result['code'] == 0:
            debug("数据存储出错 --> {name}".format(name=item['title']))
        else:
            debug("数据存储成功 --> {name}".format(name=item['title']))
        return {"code": 0}

    def get_data(self, item):
        """
        :param item: row of the movie list; ``url`` is read
        :return: the page returned by ``curlData``
        """
        url = item['url']
        page_resource = curlData(url, cookie=self.cookie, open_virtual_ip=True)
        return page_resource

    def get_cookie(self):
        """
        Request fresh cookies for ``settings.DOMAIN`` and count the refresh.

        :return:
        """
        url = settings.DOMAIN
        self.cookie_get_num = self.cookie_get_num + 1
        self.cookie = getCookie(url, open_virtual_ip=True)

    def __handle_data(self, page_resource, item):
        """
        :param page_resource: HTML of the movie page
        :param item: row of the movie list
        :return: ``{"code": <result of the insert>}``
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        insert_arr = dict()
        insert_arr['url'] = self.__get_url(bs, page_resource, item)
        insert_arr['parent_id'] = item['id']
        code = self.__save_data(insert_arr)
        return {"code": code}

    def __save_data(self, insert_arr):
        """
        Insert one row into the ``content`` table while holding ``lock``.

        :param insert_arr: column/value mapping of the new row
        :return: the result of ``self.db.insert``
        """
        sql = self.db.getInsertSql(insert_arr,
                                   "content",
                                   table_columns=self.table_columns)
        lock.acquire()
        try:
            result = self.db.insert(sql, is_close_db=False)
        finally:
            # Without this release the first insert would block every other
            # worker forever.
            lock.release()
        return result

    def __get_url(self, bs, page_resource, item):
        """
        Resolve the play link of a movie page.

        :param bs: parsed movie page
        :param page_resource: raw movie page, searched for the laravel token
        :param item: row of the movie list; ``url`` is sent as referer
        :return: the play link, or "" when it could not be resolved
        """
        url = bs.find("tbody", attrs={"id": "servidores_online"})
        url_str = ""
        try:
            token = re.findall(r'window.laravel_token = "([\w\W]*?)";',
                               str(page_resource))[0]
        except Exception as e:
            debug("电影播放链接 _token 获取出错,出错信息:{error}".format(error=e))
            return url_str
        try:
            url = url.find_all("tr")
            for k, v in enumerate(url):
                # The first row is skipped; only the second row is used.
                if k < 1:
                    continue
                data = v.find("div", attrs={
                    "class": "embedplayer"
                }).attrs['data-player']
                post = {"data": data, "tipo": "videohost", "_token": token}
                url_str = self.__get_url_curl(post, referer=item['url'])
                break
        except Exception as e:
            url_str = ""
            debug("电影播放链接获取出错,出错信息:{error}".format(error=e))
        return url_str

    def __get_url_curl(self, post, referer):
        """
        POST the player data and return the link from the JSON answer. When the
        answer is not JSON the cookies are refreshed and the request repeated
        while fewer than three refreshes have happened.

        :param post: form data of the request
        :param referer: value of the ``referer`` header
        :return: the ``data`` field of the answer, or "" on failure
        """
        headers = {
            "user-agent": getUserAgent(),
            "origin": "https://www.pelisplay.tv",
            "referer": referer
        }
        url = "https://www.pelisplay.tv/entradas/procesar_player"
        data = curlData(url, value=post, cookie=self.cookie, header=headers)
        try:
            data = json.loads(data)
        except Exception as e:
            lock.acquire()
            try:
                self.get_cookie()
            finally:
                # Keep the lock usable even when the refresh fails.
                lock.release()
            if self.cookie_get_num < 3:
                return self.__get_url_curl(post, referer=referer)
            data = {"estado": 500}
            debug("播放链接获取出错,错误信息:{error}".format(error=e))
        if data['estado'] == 200:
            data = data['data']
        else:
            data = ""
        self.cookie_get_num = 0
        return data
コード例 #18
0
ファイル: mv_category.py プロジェクト: guaidashu/movie_spider
class MvCategory(object):
    """
    Extracts the movie categories from the saved index page and stores them
    in the ``type`` table.
    """

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def get_category(self):
        """
        :return: all rows of the ``type`` table, or an empty list when the
                 query yields nothing
        """
        rows = self.db.select({"table": "type"}, is_close_db=False)
        return rows if rows else []

    def get(self):
        """
        Parse the index page stored in ``tmp/index_page.txt`` and save every
        category found in it. The page is read from that file rather than
        fetched through ``get_data``.

        :return:
        """
        with open("tmp/index_page.txt", "rb") as f:
            page_resource = f.read().decode("utf-8")
        soup = BeautifulSoup(page_resource, "html.parser")
        for entry in self.__get_category_list(soup):
            self.handle_data(entry)

    def handle_data(self, item):
        """
        Build the row for one category element and store it.

        :param item: ``<li>`` element of one category
        :return:
        """
        insert_arr = {
            'status': 0,
            'url': self.__get_category_url(item),
            'img_src': self.__get_category_img_src(item),
            'icon_img_src': self.__get_category_icon_img_src(item),
            'name': self.__get_category_name(item),
            'description': self.__get_category_description(item),
        }
        stored = self.__save_date(insert_arr)
        debug("类型存储成功" if stored else "类型存储失败")

    def __save_date(self, insert_arr):
        """
        Insert one row into the ``type`` table.

        :param insert_arr: column/value mapping of the new row
        :return: False when the insert returns 0, True otherwise
        """
        column_types = (("id", "int"), ("img_src", "varchar"),
                        ("icon_img_src", "varchar"), ("url", "varchar"),
                        ("name", "varchar"), ("description", "text"))
        statement = self.db.getInsertSql(insert_arr,
                                         table="type",
                                         table_columns=column_types)
        return self.db.insert(statement, is_close_db=False) != 0

    @classmethod
    def get_data(cls):
        """
        :return: the page at ``settings.DOMAIN`` as returned by ``curlData``
        """
        return curlData(settings.DOMAIN, open_virtual_ip=True)

    @classmethod
    def __get_category_list(cls, bs):
        """
        :param bs: parsed index page
        :return: the ``li.item`` elements of the first ``ul.owl-carousel``,
                 or an empty list when they cannot be read
        """
        carousels = bs.find_all("ul", attrs={"class": "owl-carousel"})
        try:
            return carousels[0].find_all("li", attrs={"class": "item"})
        except Exception as e:
            debug("类型列表获取失败,错误信息:{error}".format(error=e))
            return list()

    @classmethod
    def __get_category_url(cls, item):
        """
        :param item: category element
        :return: ``href`` of its first ``<a>``, or "" on failure
        """
        anchor = item.find("a")
        try:
            return anchor.attrs['href']
        except Exception as e:
            debug("分类url链接获取失败,错误信息:{error}".format(error=e))
            return ""

    @classmethod
    def __get_category_img_src(cls, item):
        """
        :param item: category element
        :return: ``src`` of its first ``<img>``, or "" on failure
        """
        image = item.find("img")
        try:
            return image.attrs['src']
        except Exception as e:
            debug("图片地址获取失败,错误信息:{error}".format(error=e))
            return ""

    @classmethod
    def __get_category_icon_img_src(cls, item):
        """
        Get the icon image source.

        :param item: category element
        :return: ``src`` of its first ``<img>``, or "" on failure
        """
        icon = item.find("img")
        try:
            return icon.attrs['src']
        except Exception as e:
            debug("icon图片地址获取失败,错误信息:{error}".format(error=e))
            return ""

    @classmethod
    def __get_category_name(cls, item):
        """
        Get the category name.

        :param item: category element
        :return: stripped text of ``div.category-name``, or "" on failure
        """
        name_node = item.find("div", attrs={"class": "category-name"})
        try:
            return name_node.get_text().strip()
        except Exception as e:
            debug("类型名获取失败,错误信息:{error}".format(error=e))
            return ""

    @classmethod
    def __get_category_description(cls, item):
        """
        :param item: category element
        :return: stripped text of ``div.category-description``, or "" on failure
        """
        description_node = item.find(
            "div", attrs={"class": "category-description"})
        try:
            return description_node.get_text().strip()
        except Exception as e:
            debug("类型描述获取失败,错误信息:{error}".format(error=e))
            return ""
コード例 #19
0
ファイル: get_recipe_video.py プロジェクト: guaidashu/recipe
class GetRecipeVideo(object):
    """
    Downloads the YouTube videos listed in the ``tmp_content`` table and
    fills that table from the ``list`` and ``content`` tables.
    """

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        """Start the download of all pending videos."""
        self.download()

    def download(self):
        """Load the pending ``tmp_content`` rows and download their videos."""
        self.__download(self.get_tmp_content())

    def __download(self, data):
        """
        Download the first mp4 stream of every video and mark its row as done.
        A failing video is logged and skipped.

        :param data: rows providing ``id`` and ``video_id``
        :return:
        """
        for row in data:
            url = "https://www.youtube.com/watch?v=%s" % row['video_id']
            debug("开始抓取:--> {video_id}".format(video_id=row['video_id']))
            try:
                stream = YouTube(url).streams.filter(subtype="mp4").first()
                stream.download(
                    "/Users/cpx/code/py/recipe/data/recipe/",
                    filename=row['video_id'])
                self.__update_data(row['id'])
            except Exception as e:
                debug(e)

    def __update_data(self, list_id):
        """
        Set ``status`` to 1 for one ``tmp_content`` row.

        :param list_id: ``id`` of the row
        :return: the result of ``self.db.update``
        """
        condition = 'id={list_id}'.format(list_id=str(list_id))
        return self.db.update(
            {
                "table": "tmp_content",
                "set": {"status": 1},
                "condition": [condition]
            },
            is_close_db=False)

    def get_tmp_content(self):
        """
        :return: ``id`` and ``video_id`` of every ``tmp_content`` row with ``status=0``
        """
        query = {
            "table": "tmp_content",
            "columns": ['id', 'video_id'],
            "condition": ['status=0']
        }
        return self.db.select(query, is_close_db=False)

    def handle_data(self):
        """Copy the content rows into ``tmp_content``."""
        self.move_data()

    def get_data(self):
        """
        :return: all rows of the ``recipe_content`` table
        """
        return self.db.select({"table": "recipe_content"}, is_close_db=False)

    def move_data(self):
        """For every category, move the content of its list rows."""
        for category in self.get_category():
            rows = self.get_list_by_type_id(category['id'])
            self.__move_data(rows)

    def __move_data(self, data):
        """
        Insert the first content row of every list row into ``tmp_content``
        with ``status`` 0. Failures are logged and skipped.

        :param data: rows of the ``list`` table
        :return:
        """
        for row in data:
            found = self.get_content_by_list_id(row['id'])
            try:
                first = found[0]
                first['status'] = 0
                self.__insert_data(first)
            except Exception as e:
                debug(e)

    def __insert_data(self, insert_arr):
        """
        :param insert_arr: column/value mapping inserted into ``tmp_content``
        :return: the result of ``self.db.insert``
        """
        statement = self.db.getInsertSql(insert_arr, "tmp_content")
        return self.db.insert(statement, is_close_db=False)

    def get_list_by_type_id(self, type_id):
        """
        :param type_id: value matched against ``recipe_type_id``
        :return: ``list`` rows of that type, queried with limit ``[0, 20]``
        """
        query = {
            "table": "list",
            "condition": ['recipe_type_id={type_id}'.format(type_id=type_id)],
            "limit": [0, 20]
        }
        return self.db.select(query, is_close_db=False)

    def get_content_by_list_id(self, list_id):
        """
        :param list_id: value matched against ``list_id``
        :return: ``video_id`` and ``list_id`` of the matching ``content`` rows
        """
        query = {
            "table": "content",
            "columns": ['video_id', 'list_id'],
            "condition": ["list_id={list_id}".format(list_id=str(list_id))]
        }
        return self.db.select(query, is_close_db=False)

    def get_category(self):
        """
        :return: ``type`` rows whose ``keyword`` is not empty
        """
        query = {"table": "type", "condition": ['keyword<>""']}
        return self.db.select(query, is_close_db=False)
コード例 #20
0
class RecipeListThread(object):
    """
    Downloads the list pages of one recipe category in a thread pool and
    stores every recipe found in the ``list`` table.
    """

    def __init__(self, page_list, category):
        """
        :param page_list: page numbers to fetch
        :param category: category row; ``keyword`` and ``id`` are read
        """
        self.page_list = page_list
        self.category = category
        # Number of pages handled so far; updated under ``lock``.
        self.handle_num = 0
        # Column types are declared here so they need not be looked up again
        # when the insert SQL is built (per the original author's note).
        self.table_columns = (("id", "int"), ("name", "varchar"),
                              ("url", "varchar"), ("img_url", "varchar"),
                              ("introduce", "text"), ("recipe_type_id", "int"),
                              ("status", "int"), ("page_views", "int"))
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        """
        Fetch every page of ``self.page_list`` with up to 10 worker threads.

        :return:
        """
        # The context manager shuts the pool down once all tasks are done.
        with ThreadPoolExecutor(max_workers=10) as thread_pool:
            task_list = [
                thread_pool.submit(self.get_data, p) for p in self.page_list
            ]
            debug("本次线程数量:{length}".format(length=len(task_list)))
            # Wait for the tasks; result() re-raises worker exceptions.
            for task in as_completed(task_list):
                result = task.result()
                if result['code'] == 0:
                    debug("{category} -- 第{page}页的数据获取完毕".format(
                        category=self.category['keyword'], page=result['page']))
        debug("处理了{length}个线程".format(length=self.handle_num))

    def get_data(self, page):
        """
        Download and store one list page.

        :param page: page number
        :return: ``{"code": 0, "page": page}``
        """
        url = CommonFunc().generate_url(page, self.category['keyword'])
        page_resource = curlData(url, open_virtual_ip=True)
        self.handle_data(page_resource)
        lock.acquire()
        try:
            self.handle_num = self.handle_num + 1
        finally:
            lock.release()
        return {"code": 0, "page": page}

    def handle_data(self, page_resource):
        """
        Parse one list page and insert every recipe while holding ``lock``.

        :param page_resource: HTML of the list page
        :return:
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        li_list = self.__get_li_list(bs)
        lock.acquire()
        try:
            for item in li_list:
                insert_arr = dict()
                insert_arr['recipe_type_id'] = self.category['id']
                insert_arr['status'] = 0
                insert_arr['img_url'] = self.__get_img_url(item)
                insert_arr['url'] = self.__get_url(item)
                insert_arr['introduce'] = self.__get_introduce(item)
                insert_arr['page_views'] = self.__get_page_views(item)
                insert_arr['name'] = self.__get_name(item)

                sql = self.db.getInsertSql(insert_arr,
                                           table="list",
                                           table_columns=self.table_columns)
                result = self.db.insert(sql, is_close_db=False)
                if result == 1:
                    debug("插入成功")
                else:
                    debug("插入失败")
        finally:
            # A failing insert must not leave the lock held, or every other
            # worker would wait on it forever.
            lock.release()

    @classmethod
    def __get_li_list(cls, bs):
        """
        :param bs: parsed list page
        :return: the ``li.cs-recipe`` elements of the first
                 ``div.cs-recipes-category``; raises IndexError when that
                 container is missing
        """
        container_list = bs.find_all("div",
                                     attrs={"class": "cs-recipes-category"})
        return container_list[0].find_all("li", attrs={"class": "cs-recipe"})

    @classmethod
    def __get_img_url(cls, item):
        """
        :param item: recipe element
        :return: ``src`` of its first ``<img>``, or "" on failure
        """
        img_url = item.find("img")
        try:
            img_url = img_url.attrs['src']
        except Exception as e:
            img_url = ""
            debug("图片链接获取出错,错误信息:{error}".format(error=e))
        return img_url

    @classmethod
    def __get_url(cls, item):
        """
        :param item: recipe element
        :return: ``href`` of its first ``<a>``, or "" on failure
        """
        url = item.find("a")
        try:
            url = url.attrs['href']
        except Exception as e:
            url = ""
            debug("详情页链接获取出错,错误信息:{error}".format(error=e))
        return url

    @classmethod
    def __get_introduce(cls, item):
        """
        :param item: recipe element
        :return: stripped text of its first ``<span>``, or "" on failure
        """
        introduce = item.find_all("span")
        try:
            introduce = introduce[0].get_text().strip()
        except Exception as e:
            introduce = ""
            debug("介绍获取出错,错误信息:{error}".format(error=e))
        return introduce

    @classmethod
    def __get_page_views(cls, item):
        """
        :param item: recipe element
        :return: text of its second ``<span>`` without commas and without the
                 " Plays" suffix, or 0 on failure
        """
        page_views = item.find_all("span")
        try:
            page_views = page_views[1].get_text().strip()
            # Drop the thousands separators and the unit.
            page_views = page_views.replace(",", "")
            page_views = page_views.replace(" Plays", "")
        except Exception as e:
            page_views = 0
            debug("浏览量获取出错,错误信息:{error}".format(error=e))
        return page_views

    @classmethod
    def __get_name(cls, item):
        """
        :param item: recipe element
        :return: stripped text of its first ``<h3>``, or "" on failure
        """
        name = item.find("h3")
        try:
            name = name.get_text().strip()
        except Exception as e:
            name = ""
            debug("菜谱名获取出错,错误信息:{error}".format(error=e))
        return name
コード例 #21
0
class RecipeListSpider(object):
    """Drive the recipe-list crawl, one pending category at a time.

    Categories are read through ``RecipeType.get_category()``; each one is
    handed to a ``RecipeListThread`` and afterwards its row in table
    ``type`` gets ``status = 1``.
    """

    def __init__(self):
        self.db = DBConfig()
        self.recipe_type = RecipeType()

    def __del__(self):
        # ``db`` does not exist when __init__ raised before assigning it, so
        # only close a connection that was actually created.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """
        Start collecting the recipe lists.

        :return: None
        """
        self.get_recipe_list()

    def get_list(self, condition=None, limit=None):
        """
        Select the rows of table ``list`` whose ``status`` is 0.

        :param condition: currently unused; accepted for call compatibility
        :param limit: currently unused; accepted for call compatibility
        :return: whatever ``self.db.select`` returns for that query
        """
        select_arr = {"table": "list", "condition": ['status=0']}
        return self.db.select(select_arr, is_close_db=False)

    def get_category(self):
        """
        Fetch the categories to process.

        :return: the result of ``self.recipe_type.get_category()``
        """
        return self.recipe_type.get_category()

    def get_recipe_list(self):
        """
        Public entry point that processes every category.

        :return: None
        """
        self.__get_recipe_list()

    def __get_recipe_list(self):
        """
        Process each category returned by ``get_category`` in turn.

        :return: None
        """
        for category in self.get_category():
            self.__get_recipe_list_child(category)

    def __set_status(self, category_id):
        """
        Set ``status = 1`` on the ``type`` row with the given id.

        :param category_id: value substituted into the ``id=...`` condition
        :return: None
        """
        update_arr = {
            "table": "type",
            "set": {
                "status": 1
            },
            "condition": ['id={category_id}'.format(category_id=category_id)]
        }
        result = self.db.update(update_arr, is_close_db=False)
        if result == 0:
            debug("更新状态出错, 出错原因:unknown")

    def __get_recipe_list_child(self, info):
        """
        Crawl the list pages of a single category.

        ``info['page_num']`` must be JSON with a ``page_list`` key holding a
        comma-separated string. The category is flagged via ``__set_status``
        in every case: unparsable ``page_num``, empty ``keyword``, or after
        the crawl thread has run.

        :param info: category row; reads ``id``, ``page_num`` and ``keyword``
        :return: None
        """
        try:
            page_list = json.loads(info['page_num'])['page_list']
        except Exception as e:
            # Best effort: a category without usable page info is logged and
            # flagged so it is not selected again.
            debug(e)
            self.__set_status(info['id'])
            return
        if info['keyword'] == "":
            self.__set_status(info['id'])
            return
        recipe_list_thread = RecipeListThread(page_list.split(","), info)
        recipe_list_thread.run()
        self.__set_status(info['id'])
コード例 #22
0
ファイル: get_img_url_large.py プロジェクト: guaidashu/recipe
class GetImgUrlLarge(object):
    """Derive the ``largethumbnails`` image URL for pending ``list`` rows.

    For every row of table ``list`` with ``status=0`` the part of
    ``img_url`` after ``squarethumbnails/`` is appended to the
    ``largethumbnails`` base URL and written to tables ``list``,
    ``tmp_list`` and ``content``.
    """

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # ``db`` does not exist when __init__ raised before assigning it, so
        # only close a connection that was actually created.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def run(self):
        """Entry point; delegates to ``handle``."""
        self.handle()

    def handle(self):
        """
        Process the pending rows in order.

        A row whose ``img_url`` has no ``squarethumbnails/`` segment is
        skipped (nothing is written for it). Processing stops at the first
        row for which any of the three updates does not return 1.

        :return: None
        """
        for item in self.get_data():
            result = self.__handle(item)
            if result is None:
                # __handle found no thumbnail path; previously this fell
                # through to ``result['result_1']`` and raised TypeError.
                continue
            if all(result[key] == 1
                   for key in ("result_1", "result_2", "result_3")):
                debug(item['img_url'])
            else:
                break

    def __handle(self, item):
        """
        Build the large-thumbnail URL for one row and store it.

        :param item: row; reads ``img_url`` and ``id``
        :return: the dict from ``__update_data``, or ``None`` when
            ``img_url`` does not contain a ``squarethumbnails/`` path
        """
        img_url = item['img_url']
        try:
            # Raw string: same pattern text as before, without relying on
            # invalid string escapes.
            s = re.findall(r'squarethumbnails\/([\w\W]*.)', img_url)[0]
        except Exception as e:
            s = ''
            debug(e)
        if s == '':
            return None
        s = 'http://www.laurainthekitchen.com/largethumbnails/' + s
        return self.__update_data(s, item)

    def __update_data(self, s, item):
        """
        Write the large image URL to the three tables.

        :param s: large-thumbnail URL to store in ``img_url_large``
        :param item: row; reads ``id``
        :return: dict with keys ``result_1`` (table ``list``, which also
            gets ``status=1``), ``result_2`` (table ``tmp_list``) and
            ``result_3`` (table ``content``, matched on ``list_id``)
        """
        by_id = ["id={id}".format(id=item['id'])]
        by_list_id = ["list_id={id}".format(id=item['id'])]
        result_1 = self.db.update(
            {
                "table": "list",
                "set": {"img_url_large": s, "status": 1},
                "condition": by_id
            },
            is_close_db=False)
        result_2 = self.db.update(
            {
                "table": "tmp_list",
                "set": {"img_url_large": s},
                "condition": by_id
            },
            is_close_db=False)
        result_3 = self.db.update(
            {
                "table": "content",
                "set": {"img_url_large": s},
                "condition": by_list_id
            },
            is_close_db=False)
        return {
            "result_1": result_1,
            "result_2": result_2,
            "result_3": result_3
        }

    def get_data(self):
        """
        Select ``img_url`` and ``id`` of every ``list`` row with ``status=0``.

        :return: whatever ``self.db.select`` returns for that query
        """
        select_arr = {
            "table": "list",
            "columns": ["img_url", "id"],
            "condition": ["status=0"]
        }
        return self.db.select(select_arr, is_close_db=False)
コード例 #23
0
ファイル: recipe_type.py プロジェクト: guaidashu/recipe
class RecipeType(object):
    """Read and populate the recipe categories stored in table ``type``."""

    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        # ``db`` does not exist when __init__ raised before assigning it, so
        # only close a connection that was actually created.
        db = getattr(self, "db", None)
        if db is not None:
            db.closeDB()

    def get(self):
        """
        Return the ``type`` rows with ``nav_type=2``.

        When the query yields nothing, the categories are scraped first via
        ``get_recipe_type`` and the query is run once more.

        :return: whatever ``self.db.select`` returns for that query
        """
        select_arr = {"table": "type", "condition": ['nav_type=2']}
        data = self.db.select(select_arr, is_close_db=False)
        if not data:
            # Nothing stored yet: fetch all categories, then re-read.
            self.get_recipe_type()
            data = self.db.select(select_arr, is_close_db=False)
        return data

    def get_category(self):
        """
        Return the ``type`` rows with ``status=0``.

        :return: whatever ``self.db.select`` returns for that query
        """
        select_arr = {"table": "type", "condition": ['status=0']}
        return self.db.select(select_arr, is_close_db=False)

    def get_recipe_type(self):
        """
        Scrape the category menu and insert its entries into table ``type``.

        :return: None
        """
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        """
        Download the page from ``CommonFunc().generate_url()`` and locate
        the category menu.

        :return: direct ``li`` children of the first ``ul`` with class
            ``sub-menu``; an empty list when the page has no such ``ul``
        """
        url = CommonFunc().generate_url()
        page_resource = curlData(url, open_virtual_ip=True)
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        if not category_ul:
            # Avoid an IndexError on a page without the menu (e.g. a failed
            # or changed download); the caller then inserts nothing.
            debug("category menu (ul.sub-menu) not found")
            return []
        # recursive=False: only the next level's <li> tags, not nested ones.
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        """
        Insert one level of menu entries into table ``type``.

        An entry whose link is ``#`` is stored with ``nav_type=1`` and its
        nested ``li`` elements are inserted recursively with the new row id
        as ``parent_id``. Any other entry is stored with ``nav_type=2`` and,
        when the link contains ``category=...``, that value as ``keyword``.
        Errors on a single entry are logged and the entry is skipped.

        :param category_li: iterable of menu ``li`` elements
        :param handle_type: 2 means the entries are children and
            ``parent_id`` is written to them; otherwise ``parent_id`` is 0
        :param parent_id: id of the parent row, used when ``handle_type`` is 2
        :return: None
        """
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    # Raw string: same pattern text as before, without
                    # relying on invalid string escapes.
                    insert_arr['keyword'] = re.findall(
                        r'category=([\w\W]*.)', href)[0]
                except Exception as e:
                    # Links without a category parameter simply get no
                    # keyword.
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    latest_id = self.db.insertLastId(sql, is_close_db=False)
                    if latest_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2,
                                                latest_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                debug(e)