Example #1
    def save_data_to_mongodb(self, request):
        # Create the category collection.
        # The data itself is read from the uploaded file.
        repo_id = request.POST['repo_id']
        # create_id = request.POST['create_id']
        file_id = request.POST['file_id']

        try:
            news_col = Mongodb(db='knowledge', collection='text').get_collection()
        except Exception:
            return self.error("mongodb没有数据库或者表")

        try:
            ret_file_data = TDataAcquisitionLog.objects.get(id=file_id)
        except Exception:
            return self.error("No file matches the given id")

        ret_file_data_dict = model_to_dict(ret_file_data)
        file_name = ret_file_data_dict['data_source']
        path_str = ret_file_data_dict['data_access']
        try:
            data = xlrd.open_workbook(path_str + file_name)
        except Exception:
            return self.error("没有找到对应文件")

        table_name = data.sheet_names()[0]
        table = data.sheet_by_name(table_name)
        list_attribute = list(table.row_values(0))
        list_json = []
        row = table.nrows
        col = table.ncols

        for i in range(1, row):
            dict_data = {}
            for j in range(col):
                dict_data[list_attribute[j]] = table.cell_value(i, j)
            dict_data['file_id'] = file_id
            news_col.insert_one(dict_data)

        ret_l = {'context': 'success'}

        return render(request, 'test1.html', context=ret_l)
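
The heart of the view above is the Excel-row-to-document conversion. Below is a minimal standalone sketch of that step, assuming only xlrd and a pymongo-style collection; the helper name, path and file_id are illustrative placeholders:

import xlrd  # same library the view uses to read the workbook


def excel_rows_to_documents(workbook_path, file_id):
    # Read the first sheet and turn every data row into a MongoDB-ready dict,
    # using the header row as field names.
    book = xlrd.open_workbook(workbook_path)
    sheet = book.sheet_by_index(0)
    headers = sheet.row_values(0)
    documents = []
    for row_idx in range(1, sheet.nrows):
        doc = {headers[col]: sheet.cell_value(row_idx, col)
               for col in range(sheet.ncols)}
        doc['file_id'] = file_id  # keep the link back to the upload record
        documents.append(doc)
    return documents


# Hypothetical usage; insert_many saves one round trip per row:
# news_col = Mongodb(db='knowledge', collection='text').get_collection()
# news_col.insert_many(excel_rows_to_documents('/data/upload/movies.xls', '42'))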
Example #2
class BaikeSpider(Driver):
    urls = []
    # tags = ["电影", "演员", "导演", "编剧", "制片人"]  # movie, actor, director, screenwriter, producer
    count = 0

    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 proxy_ip_from="",
                 spider_id='2'):
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        # self.baike_col = Mongodb(db='movies1', collection="baike_member").get_collection()
        self.baike_col = Mongodb(db='baike',
                                 collection="test1").get_collection()

    def get_infos(self, url="", extensive_properties=None):
        if extensive_properties is None:
            extensive_properties = {}
        self.fast_new_page(url=url)
        relationship_urls = []
        relationship_tags = []
        if self.judge_web_element_exist_by_css_selector(
                css_selector=
                "div.polysemantList-header-title > div.toggle.expand"):
            synonym = self.until_presence_of_element_located_by_css_selector(
                css_selector=
                "div.polysemantList-header-title > div.toggle.expand > a")
            self.scroll_to_center(synonym)
            synonym.click()
            member_urls = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector=
                "ul.polysemantList-wrapper.cmn-clearfix > li.item > a")
            for item in member_urls:
                # for tag in self.tags:
                #     if tag in item.text:
                relationship_urls.append(item.get_attribute("href"))
                relationship_tags.append(item.text)
                # break
        if self.driver.current_url not in self.urls:
            data = self.get_base_info_from_baike()
            if data is not None:
                current_tag = self.until_presence_of_element_located_by_css_selector(
                    css_selector=
                    "ul.polysemantList-wrapper.cmn-clearfix > li.item > span.selected"
                )
                data.setdefault("tag", current_tag.text)
                data.update(extensive_properties)
                print(data)
                self.baike_col.insert_one(data)
                self.urls.append(self.driver.current_url)
            self.close_curr_page()

        for item in relationship_urls:
            if item not in self.urls:
                self.fast_new_page(url=item)
                data = self.get_base_info_from_baike()
                if data is not None:
                    data.setdefault(
                        "tag",
                        relationship_tags[relationship_urls.index(item)])
                    data.update(extensive_properties)
                    print(data)
                    self.baike_col.insert_one(data)
                    self.urls.append(item)
                self.close_curr_page()
        if self.count == 10:
            return False
        return True

    def get_base_info_from_baike(self):
        try:
            if not self.judge_web_element_exist_by_css_selector(
                    css_selector=
                    "div.content > div.main-content div.basic-info.cmn-clearfix"
            ):
                return
            basic_info_div = self.until_presence_of_element_located_by_css_selector(
                css_selector=
                "div.content > div.main-content div.basic-info.cmn-clearfix")

            if self.judge_web_element_exist_by_css_selector(
                    ele=basic_info_div, css_selector="a.toggle.toExpand"):
                btn = self.until_presence_of_element_located_by_css_selector(
                    ele=basic_info_div, css_selector="a.toggle.toExpand")
                self.scroll_to_center(btn)
                btn.click()

            basic_info_name = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dt.basicInfo-item.name", ele=basic_info_div)
            basic_info_value = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="dl > dd.basicInfo-item.value",
                ele=basic_info_div)
            data = {}
            for i in range(len(basic_info_name)):
                name = basic_info_name[i].text.replace(" ", "")
                value = basic_info_value[i].text
                if name == "" or value.replace(" ", "") == "":
                    continue
                data.setdefault(name, value)
            data.setdefault("url", self.driver.current_url)
            if self.judge_web_element_exist_by_css_selector(
                    css_selector="div.lemma-summary"):
                base_infos = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.lemma-summary").text
                data.setdefault("基础信息", base_infos)
            self.count = 0
            return data
        except Exception:
            self.count += 1
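
A minimal driver loop for BaikeSpider might look like the sketch below; the seed URL, the keyword arguments and the "source" property are illustrative only, while Driver and Mongodb come from the project's own modules:

# Hypothetical usage sketch for BaikeSpider.
seed_urls = [
    "https://baike.baidu.com/item/%E6%B5%81%E6%B5%AA%E5%9C%B0%E7%90%83",
]

spider = BaikeSpider(isheadless=True, isloadimages=False)
for url in seed_urls:
    # get_infos returns False once ten consecutive pages fail to parse,
    # which is the spider's own stop signal.
    if not spider.get_infos(url=url, extensive_properties={"source": "baike"}):
        break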
Example #3
class MaoyanSpider(Driver):
    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 spider_id='2'):
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy)
        self.boxoffice_col = Mongodb(db='knowledge',
                                     collection='text').get_collection()
        self.news_col = Mongodb(db='movies1',
                                collection='news').get_collection()

    @staticmethod
    def find_key_from_value(mapping, value):
        # Return the first key whose value equals `value`, or None if absent.
        for key, val in mapping.items():
            if val == value:
                return key
        return None

    def get_boxoffice_infos_from_one_page(self,
                                          url="",
                                          datetime="",
                                          user_id=-1,
                                          repo_id=-1):
        """
        Fetch the Maoyan box-office figures for the given date.
        :param repo_id:
        :param user_id:
        :param datetime:
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        time.sleep(1)
        if not self.judge_web_element_exist_by_css_selector(
                css_selector="div.dashboard-content"):
            self.close_curr_page()
            return True
        theads = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.dashboard-list > table.dashboard-table.table-header > thead > tr > th"
        )[1:]
        theads = [item.text for item in theads]
        if not self.judge_web_element_exist_by_css_selector(
                css_selector=
                "div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"
        ):
            self.close_curr_page()
            return False
        boxoffice_infos = self.until_presence_of_all_elements_located_by_css_selector(
            css_selector=
            "div.movielist-container > div.movielist > table.dashboard-table > tbody > tr"
        )
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        boxoffice_data_from_the_page = []
        for item in boxoffice_infos:
            one_boxoffice_data = {}
            boxoffice_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="td", ele=item)
            movie_name = self.until_presence_of_element_located_by_css_selector(
                css_selector="div > div.moviename-desc > p.moviename-name",
                ele=boxoffice_info[0])
            movie_info = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector=
                "div > div.moviename-desc > p.moviename-info > span",
                ele=boxoffice_info[0])
            one_boxoffice_data.setdefault("日期", datetime)
            one_boxoffice_data.setdefault("电影名", movie_name.text)
            one_boxoffice_data.setdefault("上映时间", movie_info[0].text)
            one_boxoffice_data.setdefault("总票房", movie_info[1].text)
            boxoffice_info = boxoffice_info[1:]
            for i in range(len(boxoffice_info)):
                one_boxoffice_data.setdefault(theads[i],
                                              boxoffice_info[i].text)
            one_boxoffice_data.setdefault("crawl_time", crwal_time)
            one_boxoffice_data.setdefault("crawl_from", "猫眼专业版")
            # self.piaofang_col.insert_one(one_piaofang_data)
            judge_result = self.judge_data_exist_by_keys(
                collection=self.boxoffice_col,
                keys={
                    "user_id": user_id,
                    "repo_id": repo_id,
                    "value.日期": one_boxoffice_data["日期"],
                    "value.电影名": one_boxoffice_data["电影名"],
                    "value.crawl_from": one_boxoffice_data["crawl_from"]
                })
            if judge_result is True:
                boxoffice_data_from_the_page.append(one_boxoffice_data)
            else:
                return boxoffice_data_from_the_page, False

        self.close_curr_page()
        return boxoffice_data_from_the_page, True

    def get_boxoffice_infos(self, spider_id, user_id, repo_id, spider_name):
        date = datetime.datetime.strptime("2020-01-23", '%Y-%m-%d')
        # date = datetime.datetime.now()
        final_result = []
        while True:
            data_list, result = self.get_boxoffice_infos_from_one_page(
                url="http://piaofang.maoyan.com/dashboard/movie?date=" +
                str(date)[:10],
                datetime=str(date)[:10],
                user_id=int(user_id),
                repo_id=int(repo_id))
            final_result.extend(data_list)
            if result is False:
                break
            date = date + datetime.timedelta(days=-1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))

        for item in final_result:
            self.boxoffice_col.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })

    def run_spider(self, url=""):
        latest_info = self.boxoffice_col.find().sort("datetime", -1).limit(1)
        date = datetime.datetime.strptime(latest_info[0]["datetime"],
                                          '%Y-%m-%d')
        date = date + datetime.timedelta(days=1)
        now = datetime.datetime.now()
        while date < now:
            self.get_boxoffice_infos_from_one_page(
                "http://piaofang.maoyan.com/dashboard/movie?date=" +
                str(date)[:10],
                str(date)[:10])
            date = date + datetime.timedelta(days=1)
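
The Maoyan crawl is normally started from the web layer. A hypothetical invocation with placeholder argument values is shown below; get_boxoffice_infos walks backwards one day at a time from its start date and stops as soon as the duplicate check in judge_data_exist_by_keys fails:

# Hypothetical invocation; the argument values are placeholders.
spider = MaoyanSpider(isheadless=True)
spider.get_boxoffice_infos(spider_id='2', user_id=1, repo_id=1,
                           spider_name='maoyan_boxoffice')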
Example #4
class DoubanSpider(Driver):
    # Set of Douban URLs of film people already crawled, used to skip duplicate URLs
    member_set = set()

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False, isloadimages=True, isproxy=False,
                 proxy_ip_from="", spider_id='2', data_queue=None):
        Driver.__init__(self, log_file_name=spider_id, ismobile=ismobile, isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless, isloadimages=isloadimages, isproxy=isproxy,
                        proxy_ip_from=proxy_ip_from)
        self.movie_col = Mongodb(db='knowledge', collection='text').get_collection()
        # self.member_col = Mongodb(db='movies', collection='member').get_collection()
        # self.comment_col = Mongodb(db='movies', collection="comments").get_collection()

    def get_member_info(self, url=""):
        """
        Fetch the detailed profile of a single film person.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        if "条目不存在" in self.driver.title or "页面不存在" in self.driver.title:
            self.close_curr_page()
            return None
        name = self.driver.title[:-4].strip()
        member_data = {}
        member_data.setdefault("member_name", name)
        member_data.setdefault("douban_url", url)
        member_div_infos = self.until_presence_of_all_elements_located_by_css_selector("div.info > ul > li")
        for item in member_div_infos:
            item = item.text.split("：")
            if len(item) < 2:
                # Skip lines that do not contain a key/value separator.
                continue
            key = item[0].strip()
            if len(item) > 2:
                value = "：".join(item[1:])
            else:
                value = item[1]
            if key == "性别" or key == "星座" or key == "出生日期" or key == "出生地" or key == "官方网站":
                member_data.setdefault(key, value.strip())
            else:
                member_data.setdefault(key, [item.strip() for item in value.split("/")])
        self.close_curr_page()
        return member_data
        # self.member_col.insert_one(member_data)
        # self.info_log(data="取得个人资料数据----" + member_data["member_name"])
        # return True

    def get_member_awards(self, url=""):
        """
        Fetch all the awards the film person has ever received.
        :param url:
        :return:
        """
        self.fast_new_page(url=url)
        awards_div = self.until_presence_of_element_located_by_css_selector("div.grid-16-8.clearfix > div.article")
        result = []
        try:
            awards_info = self.until_presence_of_all_elements_located_by_css_selector(css_selector="div.awards", ele=awards_div, timeout=5)
        except Exception:
            self.close_curr_page()
            return result
        for temp in awards_info:
            awards_time = self.until_presence_of_element_located_by_css_selector(css_selector="div.hd > h2", ele=temp)
            awards = self.until_presence_of_all_elements_located_by_css_selector(css_selector="ul.award", ele=temp)
            for award in awards:
                data = {}
                award_info = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=award)
                data.setdefault("time", awards_time.text)
                data.setdefault("award_from", award_info[0].text)
                data.setdefault("award", award_info[1].text)
                data.setdefault("relevant_movie", award_info[2].text)
                result.append(data)
        self.close_curr_page()
        return result

    def get_member_movies(self, url=""):
        """
        Fetch the list of all movies the film person has participated in.
        :param url:
        :return:
        """
        movies = []
        self.fast_new_page(url=url)
        while True:
            movies_a = self.until_presence_of_all_elements_located_by_css_selector("div.article > div.grid_view > ul > li > dl > dd > h6 > a")
            for temp in movies_a:
                movies.append(temp.text)
            try:
                self.vertical_scroll_to()
                next_page = self.until_presence_of_element_located_by_css_selector("div.article > div.paginator > span.next > a", timeout=5)
                next_page.click()
                time.sleep(1)
            except Exception:
                self.close_curr_page()
                return movies

    def get_comments(self, url="", movie_name="", movie_id=None):
        """
        Fetch the 20 comments on one page.
        :param url:
        :param movie_name:
        :param movie_id:
        :return:
        """
        self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return
        comments_list = self.until_presence_of_all_elements_located_by_css_selector("div.article > div#comments.mod-bd > div.comment-item")
        if not self.judge_web_element_exist_by_css_selector(ele=comments_list[0], css_selector="div.comment"):
            self.close_curr_page()
            return
        for temp in comments_list:
            self.scroll_to_center(temp)
            data = {}
            commenter_name = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > h3 > span.comment-info > a", ele=temp)
            commenter_useful = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > h3 > span.comment-vote > span.votes", ele=temp)
            comment_content = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > p > span.short", ele=temp)
            comment_time = self.until_presence_of_element_located_by_css_selector(css_selector="div.comment > h3 > span.comment-info > span.comment-time", ele=temp)
            data.setdefault("movie_name", movie_name)
            data.setdefault("nickname", commenter_name.text)
            data.setdefault("useful", commenter_useful.text)
            data.setdefault("time", comment_time.text)
            data.setdefault("content", comment_content.text)
            data.setdefault("comment_from", "douban.com")
            if movie_id is not None:
                data.setdefault("movie_id", movie_id)
            if self.judge_web_element_exist_by_css_selector(ele=temp, css_selector="div.comment > h3 > span.comment-info > span.rating"):
                commenter_evaluate = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.comment > h3 > span.comment-info > span.rating", ele=temp)
                data.setdefault("evaluate", commenter_evaluate.get_attribute("title"))
            else:
                data.setdefault("evaluate", "")
            # self.comment_col.insert_one(data)
        self.close_curr_page()

    def get_one_movie_info(self, ele=None):
        """
        Fetch the detailed data of one movie.
        :param ele:
        :return:
        """
        self.fast_click_page_by_elem(ele=ele)
        time.sleep(1)
        # self.fast_new_page(url=url)
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        try:
            actor_more = self.driver.find_element_by_css_selector("div#info > span.actor > span.attrs > a.more-actor")
            actor_more.click()
            mask = 1
        except Exception:
            mask = 0
        div_info = self.until_presence_of_element_located_by_css_selector(css_selector="div#info")
        infos = div_info.text
        info_list = infos.split("\n")
        movie_info = {}
        for info in info_list:
            info = info.split(":")
            key = info[0].strip()
            if len(info) == 1 or (len(info) == 2 and info[1] == ""):
                continue
            elif len(info) > 2:
                value = ":".join(info[1:])
            else:
                value = info[1]
            if key == "官方网站":
                movie_info.setdefault(key, value.strip())
            else:
                movie_info.setdefault(key, [item.strip() for item in value.split("/")])
        # member_link = self.until_presence_of_all_elements_located_by_css_selector(css_selector="span span.attrs a",
        #                                                                     ele=div_info)
        # if mask == 1:
        #     member_link = member_link[:-1]
        # for item in member_link:
        #     item_link = item.get_attribute("href")
        #     if item_link in self.member_set:
        #         continue
        #     self.member_set.add(item_link)
        #     actor_info = {"member_name": item.text, "douban_url": item_link}
        #     self.dataQueue.put(actor_info)
        # self.close_curr_page()
        comment1 = self.until_presence_of_element_located_by_css_selector(
            "div#comments-section > div.mod-hd > h2 > span.pl > a")
        comment2 = self.until_presence_of_element_located_by_css_selector(
            "section#reviews-wrapper > header > h2 > span.pl > a")
        comment_number = int(re.findall(r'\d+', comment1.text)[0]) + int(re.findall(r'\d+', comment2.text)[0])
        movie_info.setdefault("豆瓣评论数量", comment_number)
        self.close_curr_page()
        return movie_info

    def get_movie_infos(self, spider_id, user_id, repo_id, spider_name):
        self.fast_new_page(
            url="https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0")
        self.driver.refresh()
        if "页面不存在" in self.driver.title or "条目不存在" in self.driver.title:
            self.close_curr_page()
            return None
        # category_ul = self.until_presence_of_element_located_by_css_selector("ul.category")
        # category = self.until_presence_of_all_elements_located_by_css_selector(css_selector="li", ele=category_ul)[5:]
        # cur = 0
        # description = category[cur].text
        # category[cur].click()
        time.sleep(1)
        css_selector = "div.list-wp a.item"
        elements_list = self.until_presence_of_all_elements_located_by_css_selector(css_selector=css_selector)
        final_result = []
        for each in elements_list:
            data = {}
            self.vertical_scroll_to()
            time.sleep(1)
            self.scroll_to_center(ele=each)
            movie_link = each.get_attribute("href")
            movie_name = self.until_presence_of_element_located_by_css_selector(ele=each,
                                                                                css_selector="div.cover-wp > img")
            movie_score = self.until_presence_of_element_located_by_css_selector(ele=each,
                                                                                 css_selector="p > strong")
            data.setdefault("电影名", movie_name.get_attribute("alt"))
            data.setdefault("豆瓣评分", movie_score.text)
            crawl_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            data.setdefault("crawl_from", movie_link)
            data.setdefault("crawl_time", crawl_time)
            movie_info = self.get_one_movie_info(ele=each)
            if movie_info is None:
                # The detail page may not exist; skip it instead of calling update() on None.
                continue
            movie_info.update(data)
            print(movie_info)
            final_result.append(movie_info)

        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(create_time=timezone.now(),
                                                                      data_source_name=spider_name,
                                                                      data_access="爬虫",
                                                                      repo_id=int(repo_id),
                                                                      create_id=int(user_id),
                                                                      data_path="")
        TEntityExtractionLog.objects.create(data_acquisition_id=one_data_acquisition_log.id, is_extract=0,
                                            entity_number=0, extract_time=timezone.now(), create_id=int(user_id),
                                            repo_id=int(repo_id))

        for item in final_result:
            judge_result = self.judge_data_exist_by_keys(collection=self.movie_col,
                                                         keys={"user_id": user_id, "repo_id": repo_id,
                                                               "value.电影名": item["电影名"],
                                                               "value.crawl_from": item["crawl_from"]})
            if judge_result is True:
                self.movie_col.insert_one(
                    {"file_id": one_data_acquisition_log.id, "category_id": -1, "spider_id": int(spider_id),
                     "user_id": int(user_id), "repo_id": int(repo_id), "value": item})

    # def run(self):
    #     """
    #     Single-thread entry point: parse the URL of each queued item and dispatch to the matching method to crawl the corresponding data.
    #     :return:
    #     """
    #     self.info_log(data="线程启动", name=self.name)
    #     count = 0
    #     while not self.dataQueue.empty() and count == 0:
    #         temp = self.dataQueue.get(False)
    #         url_path = urlparse(temp["douban_url"]).path
    #         while True:
    #             try:
    #                 if "/celebrity" in url_path:
    #                     # Fetch one film person's detailed record
    #                     member_info = self.get_member_info(temp["douban_url"])
    #                     if member_info is None:
    #                         print("人物数据不存在")
    #                         break
    #                     member_awards = self.get_member_awards(temp["douban_url"] + "awards")
    #                     member_movies = self.get_member_movies(temp["douban_url"] + "movies")
    #                     member_info.setdefault("awards", member_awards)
    #                     member_info.setdefault("acting_movies", member_movies)
    #                     self.member_col.insert_one(member_info)
    #                     self.info_log(data="成功获取并存储一条人物数据-----" + member_info["member_name"], name=self.threadName)
    #                 elif "/subject" in url_path and "/subject_search" not in url_path and "/comments" not in url_path:
    #                     # Fetch one movie record; on success, push its review-page URL onto the queue
    #                     movie_info = self.get_movie_info(temp["douban_url"])
    #                     if movie_info is None:
    #                         print("电影数据不存在")
    #                         break
    #                     movie_info.update(temp)
    #                     self.movie_col.insert_one(movie_info)
    #                     self.info_log(data="成功获取并存储一条电影数据-----" + movie_info["movie_name"], name=self.threadName)
    #                     print(movie_info)
    #                     comments_url = temp["douban_url"] + "comments?start=0&limit=20&sort=new_score&status=P"
    #                     self.dataQueue.put({"movie_name": temp["movie_name"], "douban_url": comments_url, "movie_id": movie_info["_id"]})
    #                 elif "/subject" in url_path and "/comments" in url_path:
    #                     # Parse the URL and crawl up to 200 comments
    #                     bits = list(urlparse(temp["douban_url"]))
    #                     qs = parse_qs(bits[4])
    #                     start = int(qs["start"][0])
    #                     while start <= 200:
    #                         qs["start"][0] = start
    #                         bits[4] = urlencode(qs, True)
    #                         temp["douban_url"] = urlunparse(bits)
    #                         self.get_comments(temp["douban_url"], temp["movie_name"], temp["movie_id"])
    #                         start += 20
    #                 count = 0
    #                 break
    #             except Exception:
    #                 # Count failures; switch the proxy IP after each failure; stop the thread after 5 consecutive failures
    #                 count += 1
    #                 if count > 5:
    #                     self.dataQueue.put(temp)
    #                     break
    #                 self.change_ip(self.get_ip(self.proxy_ip_from))

    @staticmethod
    def get_data_source():
        """
        Return the Douban URLs of film people that have already been crawled.
        :return:
        """
        member_col = Mongodb(db='movies', collection='member').get_collection()
        url_set = set()
        for item in member_col.find():
            url_set.add(item["douban_url"])
        return url_set
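
The trickiest part of get_one_movie_info is parsing Douban's div#info block, whose lines use a full-width colon and slash-separated multi-value fields. An isolated, runnable sketch of that same parsing logic (the helper name is illustrative):

def parse_douban_info_block(text):
    # Parse the plain text of Douban's div#info block into a dict.
    # Lines look like '导演: 宁浩' or '类型: 喜剧 / 剧情'; key and value are
    # separated by a full-width colon, multi-valued fields by '/'.
    info = {}
    for line in text.split("\n"):
        parts = line.split("：")
        if len(parts) < 2 or parts[1].strip() == "":
            continue
        key = parts[0].strip()
        value = "：".join(parts[1:]).strip()
        if key == "官方网站":  # URLs may legitimately contain '/'
            info[key] = value
        else:
            info[key] = [item.strip() for item in value.split("/")]
    return info


# parse_douban_info_block("导演： 宁浩\n类型： 喜剧 / 剧情")
# -> {'导演': ['宁浩'], '类型': ['喜剧', '剧情']}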
Example #5
class MtimeSpider(Driver):
    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 isloadimages=True,
                 isproxy=False,
                 spider_id='2'):
        Driver.__init__(self,
                        log_file_name=spider_id,
                        ismobile=ismobile,
                        isvirtualdisplay=isvirtualdisplay,
                        isheadless=isheadless,
                        isloadimages=isloadimages,
                        isproxy=isproxy)
        self.collection = Mongodb(db='knowledge',
                                  collection='text').get_collection()

    def get_news_from_one_page(self, ele=None):
        if ele is None:
            return None
        self.fast_click_page_by_elem(ele=ele)
        # self.fast_new_page(url)
        time.sleep(1)
        if self.judge_web_element_exist_by_css_selector(
                css_selector="p.newsinnerpageall > span > a"):
            show_all_page_btn = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsinnerpageall > span > a")
            show_all_page_btn.click()
        try:
            news_title = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > div.newsheadtit").text
            news_time = re.findall(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newsheader > p.newstime").text)[0]
            news_source = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsheader > p.newstime > span.ml15"
            ).text.split(":")[1]
            news_content = self.until_presence_of_element_located_by_css_selector(
                css_selector="div.newsnote").get_attribute(
                    'innerHTML'
                ) + self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#newsContent").get_attribute("innerHTML")
            news_author = self.until_presence_of_element_located_by_css_selector(
                css_selector="p.newsediter").text.split("：")[1]
        except Exception:
            return None
        crawl_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        one_news = {}
        one_news.setdefault("标题", news_title)
        one_news.setdefault("时间", news_time)
        one_news.setdefault("来源", news_source)
        one_news.setdefault("内容", news_content)
        one_news.setdefault("作者", news_author)
        one_news.setdefault("crawl_from", self.get_current_url())
        one_news.setdefault("crwal_time", crwal_time)
        self.close_curr_page()
        return one_news

    def get_news_infos(self, spider_id, user_id, repo_id, spider_name):
        url = "http://news.mtime.com/movie/1/"
        self.fast_new_page(url=url)
        time.sleep(1)
        final_result = []
        flag = 0
        while True:
            while self.judge_web_element_exist_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore"
            ):
                more_info_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div.newscontent > div#leftNews > a#viewmore")
                self.scroll_to_center(more_info_btn)
                more_info_btn.click()
                time.sleep(1)
            news_list = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector="ul#newslist > li")
            for item in news_list:
                one_news = self.get_news_from_one_page(ele=item)
                if one_news is None:
                    continue
                print(one_news)
                judge_result = self.judge_data_exist_by_keys(
                    collection=self.collection,
                    keys={
                        "user_id": user_id,
                        "repo_id": repo_id,
                        "value.crawl_from": one_news["crawl_from"]
                    })
                if judge_result:
                    final_result.append(one_news)
                else:
                    flag = 1
                    break
            if flag == 1 or not self.judge_web_element_exist_by_css_selector(
                    css_selector="div#pages > a.cur + a"):
                break
            else:
                next_page_btn = self.until_presence_of_element_located_by_css_selector(
                    css_selector="div#pages > a.cur + a")
                self.fast_click_page_by_elem(ele=next_page_btn)
                time.sleep(1)
        if len(final_result) == 0:
            return
        one_data_acquisition_log = TDataAcquisitionLog.objects.create(
            create_time=timezone.now(),
            data_source_name=spider_name,
            data_access="爬虫",
            repo_id=int(repo_id),
            create_id=int(user_id),
            data_path="")
        TEntityExtractionLog.objects.create(
            data_acquisition_id=one_data_acquisition_log.id,
            is_extract=0,
            entity_number=0,
            extract_time=timezone.now(),
            create_id=int(user_id),
            repo_id=int(repo_id))

        for item in final_result:
            self.collection.insert_one({
                "file_id": one_data_acquisition_log.id,
                "category_id": -1,
                "spider_id": int(spider_id),
                "user_id": int(user_id),
                "repo_id": int(repo_id),
                "value": item
            })
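
As with the other spiders, MtimeSpider is driven from the web layer; a hypothetical call with placeholder argument values:

# Hypothetical invocation of the Mtime news spider.
spider = MtimeSpider(isheadless=True, isloadimages=False)
spider.get_news_infos(spider_id='2', user_id=1, repo_id=1, spider_name='mtime_news')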