Code Example #1
    def parse(self, response):
        receive_time = time.time()

        if response.status == 200:
            session = sql_manager.get_sql_curser()
            # Collect every movie entry on the current list page.
            movie_list = response.xpath('//div[@id="wrapper"]//div[@id="main"]//div[@class="lister-item mode-simple"]')
            if len(movie_list) > 0:
                movie_list_sql = []
                for movie in movie_list:
                    title = movie.xpath('.//span[@class="lister-item-header"]/span[2]/a/text()')
                    url = movie.xpath('.//span[@class="lister-item-header"]/span[2]/a/@href')
                    if len(title) > 0 and len(url) > 0:
                        movie_list_sql.append(IMBD_movie_url_SQL(title=title[0].extract(), url=url[0].extract()))

                session.add_all(movie_list_sql)
                session.commit()

            # Record that this list page was crawled successfully.
            session.add(IMBD_spider_SQL(url=response.url, status=str(response.status)))
            session.commit()

            self.index += 1
            print('%d successfully crawled %s receive_time:%f'
                  % (self.index, response.url, receive_time - response.meta['time']))
            sql_manager.release_sql_curser(session)

            # Follow pagination, but stop after 100 pages.
            if self.index <= 100:
                next_page = response.xpath('//body//a[@class="lister-page-next next-page"]')
                if next_page:
                    next_page_url = self.domains_name + next_page[0].xpath('./@href')[0].extract()
                    print('new url: %s' % next_page_url)
                    yield scrapy.Request(url=next_page_url, callback=self.parse, errback=self.error_back,
                                         meta={'time': time.time()})
                else:
                    print('no new url')
            else:
                print('too many pages')
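
Every example on this page leans on a `sql_manager` module and a set of SQLAlchemy models (`IMBD_movie_url_SQL`, `IMBD_spider_SQL`, `IMBD_movie_SQL`, `IMBD_keyword_SQL`, `IMBD_summary_SQL`) that are not shown. The sketch below is only a guess at their shape, reconstructed from the calls above: `get_sql_curser`/`release_sql_curser` are assumed to hand out plain SQLAlchemy sessions (despite the "curser" name), and the column lists and database backend are assumptions.

    # Hypothetical sketch of the unseen sql_manager module and two of its models.
    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()
    engine = create_engine('sqlite:///imdb.db')   # backend is an assumption
    _Session = sessionmaker(bind=engine)

    class IMBD_movie_url_SQL(Base):
        __tablename__ = 'movie_url'
        id = Column(Integer, primary_key=True)
        title = Column(String)
        url = Column(String)

    class IMBD_spider_SQL(Base):
        # Crawl log: one row per fetched page; remark holds the IMDB title id.
        __tablename__ = 'spider_log'
        id = Column(Integer, primary_key=True)
        url = Column(String)
        status = Column(String)
        remark = Column(String, nullable=True)

    def get_sql_curser():
        # Despite the name, this hands out a SQLAlchemy Session, not a DB cursor.
        return _Session()

    def release_sql_curser(session):
        session.close()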
Code Example #2
    def __init__(self):
        super(DetailSpider, self).__init__()
        self.index = 0

        # Preload every stored movie URL so the spider can schedule a request per title.
        session = sql_manager.get_sql_curser()
        self.tid_list = session.query(IMBD_movie_url_SQL).all()
        sql_manager.release_sql_curser(session)

        print('init finished')
Code Example #3
File: userRating.py Project: bladesaber/COMP7630
    def parse(self, response):
        self.index += 1

        if response.status == 200:
            try:
                tid = response.meta['tid']
                item_list, user_list = self.get_review_list(response, tid)

                session = sql_manager.get_sql_curser()
                if len(item_list) > 0:
                    session.add_all(item_list)
                    # committed together with the crawl-log row below

                # if len(user_list):
                #     for item in user_list:
                #         session.merge(item)
                #     session.commit()

                session.add(
                    IMBD_spider_SQL(url=response.url,
                                    status=str(response.status),
                                    remark=tid))
                session.commit()
                sql_manager.release_sql_curser(session)

                # Follow this title's review pagination; '' means there is no next page.
                next_url = self.get_next_url(response, tid=tid)
                if next_url != '':
                    yield scrapy.Request(url=next_url,
                                         callback=self.parse,
                                         errback=self.error_back,
                                         meta={'tid': tid})

                print('%d successfully crawled %s' % (self.index, response.url))

            except Exception as e:
                # Note: a session acquired before the exception is never released here;
                # see the context-manager sketch at the end of this page.
                session = sql_manager.get_sql_curser()
                session.add(
                    IMBD_spider_SQL(url=response.url,
                                    status=str(999),
                                    remark=response.meta['tid']))
                session.commit()
                sql_manager.release_sql_curser(session)
                print(e)
                print('error on %s' % response.url)
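
The helpers `get_review_list` and `get_next_url` are not shown. From the call sites, `get_next_url` must return the next review-page URL for the title, or an empty string once pagination is exhausted. A hypothetical sketch, assuming the `paginationKey`-based review endpoint and `load-more-data` markup that IMDB served around the time this project was written:

    def get_next_url(self, response, tid):
        # Hypothetical: IMDB review pages exposed the next page through a
        # data-key attribute on the "load more" container (markup is an assumption).
        key = response.xpath('//div[@class="load-more-data"]/@data-key')
        if len(key) > 0:
            return 'https://www.imdb.com/title/%s/reviews/_ajax?paginationKey=%s' % (tid, key[0].extract())
        return ''   # empty string signals "no next page" to parse()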
Code Example #4
File: userRating.py Project: bladesaber/COMP7630
    def error_back(self, failure):
        # Scrapy errbacks receive a twisted Failure, not a Response; for HTTP errors
        # the original response is reachable as failure.value.response, and the
        # request meta as failure.request.meta (a Failure has no .meta attribute).
        print('failed crawl:', failure.value.response.url)

        session = sql_manager.get_sql_curser()
        session.add(
            IMBD_spider_SQL(url=failure.value.response.url,
                            status=str(failure.value.response.status),
                            remark=failure.request.meta['tid']))
        session.commit()
        sql_manager.release_sql_curser(session)
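
Not every failure that reaches an errback carries a response: a DNS error or timeout has no `failure.value.response`, so the handler above would itself raise. A more defensive variant using Scrapy's documented `failure.check()` idiom (a sketch; the 998/997 sentinels are invented here, mirroring the project's 999):

    from scrapy.spidermiddlewares.httperror import HttpError
    from twisted.internet.error import DNSLookupError, TimeoutError

    def error_back(self, failure):
        request = failure.request
        if failure.check(HttpError):
            # Non-2xx response: the response object is available.
            status = str(failure.value.response.status)
        elif failure.check(DNSLookupError, TimeoutError):
            status = '998'   # hypothetical sentinel for network-level failures
        else:
            status = '997'   # hypothetical sentinel for anything else

        session = sql_manager.get_sql_curser()
        session.add(IMBD_spider_SQL(url=request.url,
                                    status=status,
                                    remark=request.meta.get('tid')))
        session.commit()
        sql_manager.release_sql_curser(session)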
Code Example #5
File: userRating.py Project: bladesaber/COMP7630
    def __init__(self, start_idx=0, end_idx=0):
        super(UserratingSpider, self).__init__()
        self.index = 0
        # self.start_idx = start_idx
        # self.end_idx = end_idx

        # Preload the stored movie URLs, then keep only this worker's slice.
        session = sql_manager.get_sql_curser()
        self.tid_list = session.query(IMBD_movie_url_SQL).all()
        sql_manager.release_sql_curser(session)
        self.tid_list = self.tid_list[5000:10000]

        print('init finished')
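
For reference, such a spider can be launched programmatically with Scrapy's standard `CrawlerProcess`; keyword arguments to `crawl()` are forwarded to the spider's `__init__`. A minimal usage sketch (the settings values are assumptions):

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'DOWNLOAD_DELAY': 1.0,        # assumed politeness settings
        'CONCURRENT_REQUESTS': 8,
    })
    process.crawl(UserratingSpider, start_idx=0, end_idx=5000)
    process.start()                   # blocks until the crawl finishes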
Code Example #6
    def parse_keywords(self, response):
        try:
            receive_time = time.time()
            self.index += 1

            # Join all keywords on the dedicated keywords page into one '|'-separated string.
            key_words = '|'.join(
                response.xpath('//div[@id="keywords_content"]/table//td/@data-item-keyword').extract()).strip()

            session = sql_manager.get_sql_curser()

            session.add(IMBD_keyword_SQL(
                tid=response.meta['tid'],
                keyword=key_words
            ))
            session.commit()

            session.add(IMBD_spider_SQL(
                url=response.url, status=str(response.status), remark=response.meta['tid']
            ))
            session.commit()
            sql_manager.release_sql_curser(session)

            print('%d successfully crawled %s receive_time:%f' % (
                self.index, response.url, receive_time - response.meta['time']))
        except Exception as e:
            # Status 999 is this project's sentinel for a parse failure.
            session = sql_manager.get_sql_curser()
            session.add(IMBD_spider_SQL(
                url=response.url,
                status=str(999),
                remark=response.meta['tid']
            ))
            session.commit()
            sql_manager.release_sql_curser(session)
            print(e)
            print('error on %s' % response.url)
            print('-------------------------------')
Code Example #7
    def parse_detail(self, response):
        receive_time = time.time()
        self.index += 1

        if response.status == 200:
            try:
                title = response.xpath('//body//div[@class="title_wrapper"]/h1/text()')
                title = title[0].extract().replace('\n', '').strip() if len(title) > 0 else None

                grading = response.xpath('//body//div[@class="title_wrapper"]/div[@class="subtext"]/text()')
                grading = grading[0].extract().replace('\n', '').strip() if len(grading) > 0 else None

                release_date = response.xpath('//span[@id="titleYear"]/a/text()')
                release_date = int(release_date[0].extract()) if len(release_date) > 0 else None

                rating = response.xpath('//div[@class="ratings_wrapper"]//span[@itemprop="ratingValue"]/text()')
                rating = float(rating[0].extract()) if len(rating) > 0 else None

                vote = response.xpath('//div[@class="ratings_wrapper"]//span[@itemprop="ratingCount"]/text()')
                vote = int(vote[0].extract().replace(',', '')) if len(vote) > 0 else None

                genres, runtime, language, country, key_words = None, None, None, None, None
                director, writer, other_names = None, None, None
                production_company, cast = None, None

                # Technical-details block: runtime, country, language, production company.
                for cell in response.xpath('//div[@id="titleDetails"]//div[@class="txt-block"]'):
                    if len(cell.xpath('./h4/text()')) > 0:
                        header = cell.xpath('./h4/text()')[0].extract().replace(':', '')

                        if header == 'Runtime':
                            runtime = int(cell.xpath('./time/text()')[0].extract().replace('min', '').strip())

                        if header == 'Country':
                            country = '|'.join(cell.xpath('./a/text()').extract()).replace('\n', '').strip()

                        if header == 'Language':
                            language = '|'.join(cell.xpath('./a/text()').extract()).replace('\n', '').strip()

                        # if header == 'Also Known As':
                        #     other_names = ' '.join(cell.xpath('./text()').extract())
                        #     other_names = other_names.replace('\n', '').strip()

                        if header == 'Production Co':
                            production_company = '|'.join(
                                cell.xpath('./a/text()').extract()).replace('\n', '').replace('| ', '|').strip()

                for cell in response.xpath('//div[@id="titleStoryLine"]//div[@class="see-more inline canwrap"]'):
                    if len(cell.xpath('./h4/text()')) > 0:
                        if 'Genre' in cell.xpath('./h4/text()')[0].extract().replace(':', ''):
                            genres = '|'.join(cell.xpath('./a/text()').extract()).replace('\n', '').strip()

                        if 'Plot Keyword' in cell.xpath('./h4/text()')[0].extract().replace(':', ''):
                            if len(cell.xpath('./nobr')) > 0:
                                key_words = None
                                # print('having more keywords')
                                keyword_url = 'https://www.imdb.com/title/%s/keywords' % response.meta['tid']
                                yield scrapy.Request(url=keyword_url, callback=self.parse_keywords,
                                                     errback=self.error_back,
                                                     meta={'tid': response.meta['tid'], 'time': time.time()})
                            else:
                                key_words = '|'.join(cell.xpath('./a/span/text()').extract()).replace('\n', '').strip()

                for cell in response.xpath('//div[@class="plot_summary "]//div[@class="credit_summary_item"]'):
                    if len(cell.xpath('./h4/text()')) > 0:
                        if 'Director' in cell.xpath('./h4/text()')[0].extract().replace(':', ''):
                            director = []
                            for a in cell.xpath('./a'):
                                dire = a.xpath('./text()')[0].extract().strip()
                                if 'more' not in dire:
                                    director.append(dire)
                            director = '|'.join(director).replace('\n', '').strip()

                        if 'Writer' in cell.xpath('./h4/text()')[0].extract().replace(':', ''):
                            writer = []
                            for a in cell.xpath('./a'):
                                writ = a.xpath('./text()')[0].extract().strip()
                                if 'more' not in writ:
                                    writer.append(writ)
                            writer = '|'.join(writer).replace('\n', '').strip()

                # Cast table: the first row is a header row, hence the > 1 check.
                if len(response.xpath('//div[@id="titleCast"]//tr')) > 1:
                    cast = '|'.join(response.xpath('//div[@id="titleCast"]//tr/td[2]/a/text()').extract())
                    cast = cast.replace('\n', '').replace('| ', '|').strip()

                session = sql_manager.get_sql_curser()
                session.add(IMBD_movie_SQL(
                    tid=response.meta['tid'],
                    title=title,
                    Grading=grading,
                    RunTime=runtime,
                    Genres=genres,
                    Release=release_date,
                    Vote=vote,
                    Rating=rating,
                    Country=country,
                    Language=language,
                    Production=production_company,
                    Director=director,
                    Writer=writer,
                    Cast=cast
                ))
                session.commit()

                if key_words is not None:
                    session.add(IMBD_keyword_SQL(
                        tid=response.meta['tid'],
                        keyword=key_words
                    ))
                    session.commit()

                session.add(IMBD_spider_SQL(
                    url=response.url, status=str(response.status), remark=response.meta['tid']
                ))
                session.commit()
                sql_manager.release_sql_curser(session)

                print('%d successfully crawled %s receive_time:%f' % (
                    self.index, response.url, receive_time - response.meta['time']))
            except Exception as e:
                # Status 999 is this project's sentinel for a parse failure.
                session = sql_manager.get_sql_curser()
                session.add(IMBD_spider_SQL(
                    url=response.url,
                    status=str(999),
                    remark=response.meta['tid']
                ))
                session.commit()
                sql_manager.release_sql_curser(session)
                print(e)
                print('error on %s' % response.url)
Code Example #8
    def parse_summary(self, response):
        try:
            receive_time = time.time()
            self.index += 1

            # A summary can span several <li> elements; the <li> that carries the
            # author block closes the current summary.
            li_list = response.xpath('//ul[@id="plot-summaries-content"]//li')
            summary_list = []
            summary = []
            for li in li_list:
                if len(li.xpath('./div[@class="author-container"]')) > 0:
                    summary.append(li.xpath('.//p')[0].xpath('string(.)')[0].extract())
                    summary_text = ' '.join(summary).replace('\n', '').strip()
                    summary_list.append(summary_text)
                    summary = []
                else:
                    summary.append(li.xpath('.//p')[0].xpath('string(.)')[0].extract())

            synopsis = response.xpath('//ul[@id="plot-synopsis-content"]/li[@class="ipl-zebra-list__item"]')
            if len(synopsis) > 0:
                synopsis = synopsis[0].xpath('string(.)')[0].extract().replace('\n', '').strip()
            else:
                synopsis = None

            # type=0 marks a user-submitted summary, type=1 the synopsis.
            summary_upload_list = []
            for t in summary_list:
                if t is not None:
                    summary_upload_list.append(IMBD_summary_SQL(
                        tid=response.meta['tid'],
                        summary=t,
                        type=0
                    ))

            if synopsis is not None:
                summary_upload_list.append(IMBD_summary_SQL(
                    tid=response.meta['tid'],
                    summary=synopsis,
                    type=1
                ))

            session = sql_manager.get_sql_curser()
            if len(summary_upload_list) > 0:
                session.add_all(summary_upload_list)
                session.commit()

            session.add(IMBD_spider_SQL(
                url=response.url, status=str(response.status), remark=response.meta['tid']
            ))
            session.commit()
            sql_manager.release_sql_curser(session)

            print('%d successfully crawled %s receive_time:%f' % (
                self.index, response.url, receive_time - response.meta['time']))

        except Exception as e:
            # Status 999 is this project's sentinel for a parse failure.
            session = sql_manager.get_sql_curser()
            session.add(IMBD_spider_SQL(
                url=response.url,
                status=str(999),
                remark=response.meta['tid']
            ))
            session.commit()
            sql_manager.release_sql_curser(session)
            print(e)
            print('error on %s' % response.url)
            print('-------------------------------')
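
One recurring issue above: every handler repeats the acquire/commit/release dance, and an exception raised between `get_sql_curser()` and `release_sql_curser()` leaks that first session (the except blocks acquire a second one). A small context-manager wrapper over the assumed `sql_manager` API makes the cleanup automatic:

    from contextlib import contextmanager

    @contextmanager
    def managed_session():
        # Acquire through the project's (assumed) sql_manager API and always release.
        session = sql_manager.get_sql_curser()
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            sql_manager.release_sql_curser(session)

    # Usage inside a callback:
    # with managed_session() as session:
    #     session.add(IMBD_spider_SQL(url=response.url, status=str(response.status)))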