Example #1
    def fetchOldArticleList(self, channel, articleList, articleCount=100):
        '''
        Fetch, from the global article table, the ids of articles that have not yet expired and that are not among the records returned by this crawl
        '''
        channel = int(channel)
        # Count the matching rows (used to work out how many pages are available)
        selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
        sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE, channel)
        # SQL for fetching the old articles
        selectSql = 'SELECT TID,title, publish_datetime,url, meta_info,like_count,reply_count,forward_count FROM %s where extinct="N" and channel_id=%d '
        sql = selectSql % (Constants.TABLE_SA_ARTICLE, channel)

        if len(articleList) > 0:
            whereClauseList = map(
                lambda article: ' tid<>"%s" ' % (article.tid), articleList)
            whereClauseList = ' and '.join(whereClauseList)
            sql += ' and (%s)' % (whereClauseList)
            sql2 += ' and (%s)' % (whereClauseList)
        sql2 += ' order by add_datetime desc;'
        self.dbProxy.execute(sql2)
        resultList2 = self.dbProxy.fetchall()
        # fetchall() returns a result such as ((53,),); take the count directly
        totalCount = int(resultList2[0][0])
        if totalCount > int(articleCount):
            # Randomly pick a page so repeated calls sample different slices
            randpage = random.randint(
                0, int(math.ceil(float(totalCount) / articleCount)) - 1)
        else:
            randpage = 0

        sql += ' order by add_datetime desc limit %d,%d' % (
            randpage * articleCount, articleCount)
        self.dbProxy.execute(sql)
        resultList = self.dbProxy.fetchall()

        L1 = []
        for item in resultList:
            result = Article(item[0],
                             channel,
                             title=item[1],
                             publish_datetime=item[2],
                             url=item[3],
                             meta_info=item[4])
            result.statistics = ArticleStatistics(item[0],
                                                  channel,
                                                  like_count=item[5],
                                                  reply_count=item[6],
                                                  forward_count=item[7])
            L1.append(result)

        return L1
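
A note on the WHERE clause assembled above: the tid values are interpolated straight into the SQL string. Below is a minimal standalone sketch of the same exclusion filter built with DB-API placeholders instead, assuming the underlying dbProxy eventually hands the query and a parameter tuple to a DB-API cursor; the namedtuple Article is only a stand-in for the project's Article class.

from collections import namedtuple

# Hypothetical stand-in for the Article objects used above
Article = namedtuple('Article', ['tid', 'channel_id'])


def build_old_article_sql(table, channel, articles):
    """Build the same 'not in this crawl' query with %s placeholders."""
    sql = ('SELECT TID, title, publish_datetime, url, meta_info, '
           'like_count, reply_count, forward_count '
           'FROM {} WHERE extinct="N" AND channel_id=%s'.format(table))
    params = [channel]
    if articles:
        # One "tid<>%s" per crawled article, joined with AND as in the original
        sql += ' AND (' + ' AND '.join(['tid<>%s'] * len(articles)) + ')'
        params.extend(a.tid for a in articles)
    return sql, tuple(params)


if __name__ == '__main__':
    sql, params = build_old_article_sql(
        'SA_ARTICLE', 5, [Article('123', 5), Article('456', 5)])
    print(sql)
    print(params)  # values travel separately, so quoting is left to the driver
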
Example #2
 def parse_main(self, response):
     item = response.meta['item']
     item['article'] = response.xpath(
         "//div[@class ='p-right left']//div[@id='p-detail']//p|"
         "//div[@id='content']//p|"
         "//div[@class='content']//p|"
         "//div[@class ='contant clearfix']/div[@class ='xl']//p|"
         "//div[@id ='Content']//p|"
         "//div[@class ='zj_left']/div[@class ='zj_nr']//p|"
         "//td[@class='text_con_16_33']//p|"
         "//div[@class ='content pack']//p|"
         "//div[@class = 'article']//p|"
         "//div[@class ='main-content-box']//p|"
         "//div[@id ='nr_wz']//p").xpath('string(.)').extract()
     item['TID'] = re.findall(r'c_.{1,}htm', item['href'])[0][2:-4]
     yield item
     article = Article(tid=item['TID'],
                       channel_id=11,
                       title=item['title'],
                       content=item['article'],
                       publish_datetime=item['time'],
                       url=item['href'],
                       author_name=item['source'],
                       digest=item['intro'])
     self.r.append(article)
     if len(self.r) == len(self.R):
         print(len(self.r))
         print('Crawling finished; starting heat analysis')
         SARunner().article_List(self.r)
Example #3
    def analysisBefore(self, event_id):
        entityEventDict = dict({'event_id': event_id})
        if entityEventDict is None or self.entity_id not in entityEventDict:
            self.logger.warn('Entity %d not found in system', self.entity_id)
            return []

        eventDict = entityEventDict[self.entity_id]

        articleTableName = Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER + self.entity_id
        eventTableName = Constants.TABLE_SA_EVENT + Constants.TABLE_NAME_DELIMITER + self.entity_id
        sqlArticlListBefore = '''
        SELECT a.TID,a.CHANNEL_ID, a.TITLE, a.CONTENT, a.PUBLISH_DATETIME, a.URL, a.AUTHOR_ID, a.AUTHOR_NAME,
        a.PUBLISH_METHOD, a.DIGEST, a.HEAT
        FROM %s as a, %s as e
        WHERE a.PUBLISH_DATETIME > e.START_DATETIME
        ''' % (articleTableName, eventTableName)
        self.dbProxy.execute(sqlArticlListBefore)
        resultList = self.dbProxy.fetchall()
        articleList = map(
            lambda item: Article(item[0], item[1], item[2], item[3], item[
                4], item[5], item[6], item[7], item[8], item[9], item[10]),
            resultList)

        hitEventList = list()
        for article in articleList:
            hitEventList.append(self.__analysisEvent(article, eventDict))
        return hitEventList
Example #4
 def __fetchAllArticleList(self, entity_id, start_time, end_time):
     '''
     Fetch the articles added to the database within the given time range
     '''
     selectSql = '''select * 
                     from %s
                     where add_datetime between '%s' and '%s'
     '''
     tableName = Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER + entity_id
     sql = selectSql % (tableName, start_time, end_time)
     self.dbProxy.execute(sql)
     results = self.dbProxy.fetchall()
     self.logger.debug(len(results))
     articleList = list()
     for item in results:
         article = Article(tid=item[0],
                           url=item[1],
                           add_datetime=item[2],
                           publish_datetime=item[3],
                           publish_method=item[4],
                           title=item[7],
                           author_id=item[8],
                           author_name=item[9],
                           content=item[11],
                           heat=item[17],
                           channel_id=item[18],
                           entity=entity_id)
         article.statistics.read_count = item[12]
         article.statistics.like_count = item[13]
         article.statistics.reply_count = item[14]
         article.statistics.forward_count = item[15]
         article.statistics.collect_count = item[16]
         articleList.append(article)
     return articleList
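
The start_time/end_time strings that __fetchAllArticleList substitutes into the BETWEEN clause are plain formatted timestamps; a minimal sketch of building such a window (an arbitrary last-24-hours range, chosen only for illustration):

import datetime


def last_day_window(now=None):
    """Return (start_time, end_time) strings covering the previous 24 hours."""
    end = now or datetime.datetime.now()
    start = end - datetime.timedelta(days=1)
    fmt = '%Y-%m-%d %H:%M:%S'
    return start.strftime(fmt), end.strftime(fmt)


if __name__ == '__main__':
    start_time, end_time = last_day_window()
    # these two strings slot into the BETWEEN '%s' AND '%s' clause above
    print(start_time, '->', end_time)
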
Example #5
 def parse_main(self, response):
     item = RMWspider1Item()
     item['title'] = response.meta['title'][0]
     item['time'] = response.meta['time']
     item['intro'] = response.meta['intro'][0].replace('[', '', 1).replace(
         ']',
         '',
     )
     item['href'] = response.meta['href']
     item['TID'] = re.findall(r'/c.{1,}html', item['href'])[0][1:-5]
     if 'people' in item['TID']:
         item['TID'] = re.findall(r'/c.{1,}', item['TID'])[0][1:]
     item['source'] = response.xpath(
         "//div[@class = 'artOri']/a/text()|"
         "//div[@class='box01']//a/text()|"
         "//div[@class='text_c']/p//a/text()|"
         "//div[@class = 'msgBox']//a/text()|"
         "//div[@class = 'page_c']/div[@class = 'fr']/a/text()|"
         "//div[@class = 'w1000 p2']//a/text()|"
         "//div[@class = 'p2j_text fl']/h2/a/text()").extract_first()
     item['article'] = response.xpath(
         "//div[@id='rwb_zw']//p|"
         "//div[@class='show_text']//p|"
         "//div[@class='artDet']//p|"
         "//div[@class='text_con clearfix']//p|"
         "//div[@class = 'content clear clearfix']//p|"
         "//div[@id = 'p_content']//p|"
         "//div[@class = 'box_con']//p|"
         "//div[@class = 'text_show']//p|"
         "//div[@class = 'gray box_text']//p|"
         "//div[@class = 'text_box clearfix']//p").xpath(
             'string(.)').extract()
     yield item
     article = Article(tid=item['TID'],
                       channel_id=5,
                       title=item['title'],
                       content=item['article'],
                       publish_datetime=item['time'],
                       url=item['href'],
                       author_name=item['source'],
                       digest=item['intro'])
     self.r.append(article)
     if len(self.R) == len(self.r):
         print(len(self.r))
         print('Crawling finished; starting heat analysis')
         SARunner().article_List(self.r)
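
Both parse_main callbacks above rely on the same pattern: a union of layout-specific XPaths piped through string(.), which flattens each matched <p> into plain text. A minimal sketch against an inline HTML fragment, assuming parsel (the selector library bundled with Scrapy) is installed:

from parsel import Selector

html = '''
<div id="rwb_zw"><p>First <b>paragraph</b>.</p><p>Second paragraph.</p></div>
<div class="show_text"><p>Alternative layout.</p></div>
'''

sel = Selector(text=html)
# Union of layout-specific selectors, then string(.) to get each <p>'s full text
paragraphs = sel.xpath(
    "//div[@id='rwb_zw']//p|"
    "//div[@class='show_text']//p").xpath('string(.)').getall()
print(paragraphs)  # ['First paragraph.', 'Second paragraph.', 'Alternative layout.']
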
Example #6
    def filterRemovedArticle(self, articleList, entityId, eventId=None):
        '''
        Filter the articles by comparing them against the remove table:
        return the list of articles that are not present in the remove table
        '''
        if len(articleList) == 0:
            return []
        if eventId is not None:
            tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ' event_id=%d and ' % eventId

            start_datetime, end_datetime = self.fetchEventTime(
                entityId, eventId)

            # Drop articles whose publish time falls outside the event's start/end window
            article_new_list = list()
            for article in articleList:
                if (str(article.publish_datetime) > str(start_datetime)) and (
                        str(article.publish_datetime) < str(end_datetime)):
                    article_new_list.append(article)

            articleList = article_new_list

        else:
            tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ''
        # Look the articles up in the remove table
        selectSql = '''
            SELECT TID, CHANNEL_ID FROM %s where %s (%s)
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)

        self.dbProxy.execute(
            selectSql %
            (tableName, eventCondition, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()  # result set of the query
        # Materialize the map object so the membership test below can run repeatedly
        removedArticleList = list(map(lambda x: Article(x[0], x[1]), resultList))
        filteredArticle = filter(lambda x: x not in removedArticleList,
                                 articleList)
        return list(filteredArticle)
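
When articleList grows large, the linear `x not in removedArticleList` scan can be replaced by a set keyed on (TID, CHANNEL_ID); a minimal standalone sketch, with a namedtuple standing in for the project's Article class and rows shaped like fetchall() results:

from collections import namedtuple

Article = namedtuple('Article', ['tid', 'channel_id'])


def filter_removed(articles, removed_rows):
    """Drop articles whose (TID, CHANNEL_ID) appears in the remove-table rows."""
    removed_keys = {(str(tid), int(cid)) for tid, cid in removed_rows}
    return [a for a in articles
            if (str(a.tid), int(a.channel_id)) not in removed_keys]


if __name__ == '__main__':
    articles = [Article('1', 9), Article('2', 9), Article('3', 5)]
    removed = [('2', 9)]                      # rows as returned by fetchall()
    print(filter_removed(articles, removed))  # keeps tid 1 and 3
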
Example #7
    def seperateNewOldArticles(self, articleList, entityId=None):
        '''
        Query the global article table and split the input into new and existing articles
        '''
        if len(articleList) == 0:
            return ([], [])
        if entityId is None:
            selectSql = 'select tid, channel_id from %s where ' % Constants.TABLE_SA_ARTICLE
        else:
            selectSql = 'select tid, channel_id from %s where ' % (
                Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER +
                entityId)
        whereClauseList = map(
            lambda article: '(tid="%s" and channel_id=%d)' %
            (article.tid, article.channel_id), articleList)
        self.dbProxy.execute(selectSql + ' or '.join(whereClauseList))
        # Materialize the map object so it can be scanned more than once below
        resultList = list(
            map(lambda x: Article(x[0], x[1]), self.dbProxy.fetchall()))

        existingArticleList = list(
            filter(lambda x: x in resultList, articleList))
        newArticleList = list(
            filter(lambda x: x not in resultList, articleList))
        return (existingArticleList, newArticleList)
Example #8
    def crawlNewsArticle(self, url):
        '''
        Crawl articles whose url starts with news.qq.com or gd.qq.com
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('news.qq.com') < 0 and article_url.find(
                    'gd.qq.com') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNews]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'id': "Main-Article-QQ"})
            main1 = soup.find('div', attrs={'id': "Main-P-QQ"})
            if main is not None:
                Ttitle = main.find('h1').text.strip()  # title
                Ttime = main.find('span', attrs={'class':
                                                 "article-time"})  # publish time
                Ttime1 = main.find('span', attrs={'class': "a_time"})
                Ttime2 = main.find('span', attrs={'class': "pubTime"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                elif Ttime1 is not None:
                    Ttime1 = Ttime1.text.strip()
                    Ttime = Ttime1
                elif Ttime2 is not None:
                    Ttime2 = Ttime2.text.strip()
                    Ttime = Ttime2
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                Tauthor = main.find('span', attrs={'class': "a_source"})
                Tauthor1 = main.find('span', attrs={'class': "color-a-1"})
                if Tauthor is not None:
                    #Tauthor = Tauthor.find('a').text.strip()
                    Tauthor = Tauthor.text.strip()
                elif Tauthor1 is not None:
                    #Tauthor1 = Tauthor1.find('a').text.strip()
                    Tauthor1 = Tauthor1.text.strip()
                    Tauthor = Tauthor1
                else:
                    Tauthor = None
                Tcontent = main.find('div',
                                     attrs={'id': "Cnt-Main-Article-QQ"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except Exception:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                if commentid is not None:
                    try:
                        re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum'
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data']['commentnum'])
                    except Exception:
                        traceInfo = traceback.format_exc()
                        self.logger.error(
                            'Failed to parse comment for %s (cid=%s):%s',
                            articleid, commentid, traceInfo)
                        Treply = None
                    article.statistics.reply_count = Treply
                return article
            elif main1 is not None:
                Ttitle = soup.find('meta', attrs={
                    'name': "Description"
                }).attrs['content']  # title
                Ttime = re.findall(
                    r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',",
                    html['html'])
                if Ttime:  # re.findall returns a list; empty means no match
                    Ttime = Ttime[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[
                        2] + ' ' + Ttime[3]
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'
                Tauthor = re.findall(r'para = {\s+name: \"(.*)\",',
                                     html['html'])
                if Tauthor:  # empty list means no author pattern matched
                    Tauthor = Tauthor[0]
                else:
                    Tauthor = None
                con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url)
                con_html = self.session.download(con_url,
                                                 encoding='gbk',
                                                 data=None,
                                                 timeout=10,
                                                 retry=3)
                con_list = re.findall(r'<p>(.*?)</p>', con_html)
                if con_list:  # non-empty list of extracted paragraphs
                    TT = []
                    for i in con_list:
                        if i.strip() not in TT:
                            TT.append(i)
                    Tcontent = ''.join(TT)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except Exception:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                try:
                    if commentid is not None:
                        re_url = 'http://coral.qq.com/article/batchcommentnum'
                        data1 = {'targetid': articleid}
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=data1,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data'][0]['commentnum'])
                    else:
                        Treply = None
                except Exception:
                    Treply = None
                article.statistics.reply_count = Treply
                return article
        return None
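
The Ttime/Ttime1/Ttime2 ladder above is a fallback chain over several timestamp markup variants; the same idea can be written as a loop over candidate class names. A minimal sketch against an inline fragment, assuming BeautifulSoup is installed (class names copied from the example, the HTML snippet is made up):

import datetime
from bs4 import BeautifulSoup

html = '<div id="Main-Article-QQ"><span class="a_time">2020-01-02 03:04</span></div>'
main = BeautifulSoup(html, 'html.parser').find('div', attrs={'id': 'Main-Article-QQ'})

publish_time = None
for class_name in ('article-time', 'a_time', 'pubTime'):
    tag = main.find('span', attrs={'class': class_name})
    if tag is not None:
        publish_time = tag.text.strip()
        break
if publish_time is None:
    # Same last resort as the crawler: fall back to the current time
    publish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
if len(publish_time) == 16:  # 'YYYY-MM-DD HH:MM' -> add seconds
    publish_time += ':00'
print(publish_time)          # 2020-01-02 03:04:00
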
Example #9
    def crawlNewArticle(self, url):
        '''
        Crawl articles whose url starts with new.qq.com
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('new.qq.com/omn') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return

            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNew]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')

            script_tags = soup.head.find_all('script')
            data = dict()
            for tag in script_tags:
                text = re.search(r'window.DATA = (.*)', tag.text, re.S)
                if text:
                    data = json.loads(text.group(1))

            tid = data['article_id']
            title = data['title']
            author_name = data['media']
            author_id = data['media_id']
            publish_datetime = data['pubtime']
            comment_id = data['comment_id']

            main = soup.find('div', attrs={'class': 'qq_conent clearfix'})
            t_content = ''
            if main is not None:
                contents = main.find_all('p', {'class': 'one-p'})
                for content in contents:
                    if content.string is None:
                        continue
                    t_content += str(content.get_text().strip())

            get_comment_count_url = 'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum' % (
                comment_id, comment_id)
            comment_data = self.session.download(get_comment_count_url)
            comment_data = re.search(
                r'_article%scommentnum\((.*)\)' % comment_id, comment_data)

            # The JSONP payload is plain JSON, so parse it instead of calling eval
            comment_dict = json.loads(comment_data.group(1))
            reply_count = comment_dict['data']['commentnum']
            meta_info = '{"commentid":"%s"}' % comment_id

            article = Article(tid=tid,
                              channel_id=self.channel.channel_id,
                              title=title,
                              content=t_content,
                              publish_datetime=publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info=meta_info)
            article.statistics.reply_count = reply_count
            return article
        return None
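
The window.DATA step above pulls a JSON blob out of an inline <script> tag; a minimal standalone sketch of the same regex-plus-json.loads extraction against a made-up script body (only the field names are copied from the example):

import json
import re

html = '''<script>
window.DATA = {"article_id": "20200102A0ABCD", "title": "demo",
               "media": "demo-source", "media_id": "42",
               "pubtime": "2020-01-02 03:04:05", "comment_id": "999"}
</script>'''

data = {}
for script_text in re.findall(r'<script>(.*?)</script>', html, re.S):
    match = re.search(r'window\.DATA = (.*)', script_text, re.S)
    if match:
        data = json.loads(match.group(1))

print(data.get('article_id'), data.get('comment_id'))
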
Example #10
    def parse_info(self, response):
        weibo_list = response.xpath("//div[@class='c' and @id]")
        for weibo in weibo_list:
            item = Weibospider1Item()
            div = weibo.xpath("./div")
            if len(div) == 1:
                # Weibo post type: original post without images
                item["category"] = "无图原创"
                item["author"] = weibo.xpath(
                    "./div/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item["content"] = weibo.xpath(
                    "./div/span[@class='ctt']").xpath('string(.)').extract()
                img = weibo.xpath("./div/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div/text()|./div/span[@class='ctt']//text()"
                    ).extract()
                item["dianzan"] = weibo.xpath("./div/a/text()").extract()[-4]
                item["relay"] = weibo.xpath("./div/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div/a[@class='cc']/text()").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div/a[@class='cc']/@href").extract_first()
                item["send_time"] = weibo.xpath(
                    "./div/span[@class='ct']/text()").extract_first()
                item["reason"] = None
                item["img_url"] = None
                item['reason_name'] = None
                item['reason_id'] = None

            elif len(div) == 2:
                item["category"] = ""
                item["content"] = weibo.xpath("./div[1]/span[@class='ctt']"
                                              ).xpath('string(.)').extract()
                img = weibo.xpath("./div/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div[1]/text()|./div[1]/span[@class='ctt']//text()"
                    ).extract()
                item["relay"] = weibo.xpath("./div[2]/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/text()").extract_first()
                item["reason"] = None
                img = weibo.xpath("./div[2]//img[@class='ib']/@src")
                if len(img) == 0:
                    # Repost without images
                    item['category'] = "无图转发"
                    item["author"] = weibo.xpath(
                        "./div/span[@class = 'cmt']/a/text()").extract_first()
                    item['author_id'] = weibo.xpath(
                        "./div[1]/a[@class='nk']/@href").extract_first()
                    item['reason_name'] = weibo.xpath(
                        "./div[1]/span[@class = 'cmt']/a/text()"
                    ).extract_first()
                    item['reason_id'] = weibo.xpath(
                        "./div[1]/span[@class = 'cmt']/a/@href").extract_first(
                        )
                    item["dianzan"] = weibo.xpath(
                        "./div[2]/a/text()").extract()[-4]
                    item["reason"] = weibo.xpath(
                        "./div[2]/text()|./div[2]//span[@class='kt']/text()"
                    ).extract()
                    item["comment_url"] = weibo.xpath(
                        "./div[2]/a[@class='cc']/@href").extract_first()
                    item["img_url"] = None
                    item["send_time"] = weibo.xpath(
                        "./div[2]/span[@class='ct']/text()").extract_first()

                else:
                    # Original post with images
                    item['category'] = "有图原创"
                    item["author"] = weibo.xpath(
                        "./div/a[@class='nk']/text()").extract_first()
                    item['author_id'] = weibo.xpath(
                        "./div[1]/a[@class='nk']/@href").extract_first()
                    item['reason_name'] = None
                    item['reason_id'] = None
                    item["dianzan"] = weibo.xpath(
                        "./div[2]/a/text()").extract()[-4]
                    item["img_url"] = weibo.xpath(
                        "./div[2]//img[@class='ib']/@src").extract_first()
                    item["comment_url"] = weibo.xpath(
                        "./div[2]/a[@class='cc']/@href").extract_first()
                    item["send_time"] = weibo.xpath(
                        "./div[2]/span[@class='ct']/text()").extract_first()

            else:
                # len(div) == 3: repost with images
                item["category"] = "带图片转发"
                item["author"] = weibo.xpath(
                    "./div[1]/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
                item['reason_id'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
                item["content"] = weibo.xpath("./div[1]/span[@class = 'ctt']"
                                              ).xpath('string(.)').extract()
                img = weibo.xpath("./div[1]/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div[1]/text()|./div[1]/span[@class='ctt']//text()"
                    ).extract()
                item["send_time"] = weibo.xpath(
                    "./div[3]/span[@class='ct']/text()").extract_first()
                item["dianzan"] = weibo.xpath(
                    "./div[3]/a/text()").extract()[-4]
                item["relay"] = weibo.xpath("./div[3]/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div[3]/a[@class='cc']/text()").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div[3]/a[@class='cc']/@href").extract_first()
                item["img_url"] = weibo.xpath(
                    "./div[2]//img[@class='ib']/@src").extract_first()
                item["reason"] = weibo.xpath(
                    "./div[3]/text()|./div[3]//span[@class='kt']/text()"
                ).extract()
            item['relay_url'] = ''

            item['TID'] = re.findall(r'uid=.{1,}&',
                                     item["comment_url"])[0][4:-1]
            a = weibo.xpath("//a[@class='nk']/@href").extract()
            yield item
            article = Article(tid=item['TID'],
                              channel_id=9,
                              content=item['content'],
                              publish_datetime=item['send_time'],
                              url=item['comment_url'],
                              title=item['content'][0:100],
                              author_id=item['author_id'],
                              author_name=item['author'])
            article.statistics = ArticleStatistics(
                tid=item['TID'],
                channel_id=9,
                reply_count=item['comment'],
                forward_count=item['relay'],
                like_count=item['dianzan'],
            )
            if int(item['relay']) > 0:
                self.relay_url_list.append(item['relay_url'])

            self.r.append(article)
            self.name_url_list.append(a)

        num_page = response.xpath(
            "//div[@id='pagelist']/form/div/text()").extract()
        num_page = [i.replace(
            u"\xa0",
            "",
        ) for i in num_page]
        num_page = [i for i in num_page if len(i) > 0][0]
        num_page = re.findall(r'\d+', num_page)

        print('Crawling page', num_page[0], 'of', num_page[1])
        max_page = NUM_PAGE
        if max_page is None:
            max_page = int(num_page[1])
        if int(num_page[0]) == max_page:
            L = []
            for L1 in self.name_url_list:
                L += L1
            for url_1 in L:
                with open(os_file.a + '\\crawler_url.txt',
                          'a',
                          encoding='utf-8') as f:
                    f.write(url_1 + "\n")

            print('Page limit reached; search-page crawling finished')
            print('Crawling finished; starting heat analysis')
            SARunner().article_List(self.r)

            print("爬取微博数:", len(self.r))
            # print('开始爬取用户详情页数据,一共有', self.L2, '个非重复用户')
            # 爬取作者头像 id 关注 粉丝
            with open(os_file.a + '\\crawler_url.txt', 'r',
                      encoding='utf-8') as f:
                urls = f.readlines()
                # Work out how many users still have to be crawled
                # Deduplicate
                L2 = {}.fromkeys(urls).keys()
                self.L2 = len(L2)
                print('Start crawling user detail pages;', self.L2, 'unique users in total')
                for url in L2:
                    yield scrapy.FormRequest(url=url,
                                             callback=self.parse_info_detail,
                                             dont_filter=True)
        else:
            next_url = response.xpath(
                "//a[text() = '下页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(next_url,
                                 callback=self.parse_info,
                                 dont_filter=True)
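
The `{}.fromkeys(urls).keys()` step near the end deduplicates the collected author URLs while keeping their first-seen order, since dicts preserve insertion order on Python 3.7+; a minimal standalone sketch with made-up sample lines:

urls = [
    'https://weibo.cn/u/111\n',
    'https://weibo.cn/u/222\n',
    'https://weibo.cn/u/111\n',   # duplicate of the first line
]

# dict.fromkeys keeps one entry per key, in first-seen order
unique_urls = list({}.fromkeys(url.strip() for url in urls))
print(len(unique_urls), unique_urls)  # 2 unique URLs, original order preserved
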
Example #11
    def parse_main(self, response):
        item = XinLangspider1Item()
        item['intro'] = str(response.meta["intro"]).replace(
            u"...",
            "",
        ).replace(
            u"']",
            "",
        ).replace(
            u"['",
            "",
        )
        item['href'] = response.meta["href"]
        item['time'] = response.meta['time']
        item['title_main'] = response.meta['title']
        item['article'] = response.xpath(
            "//div[@id = 'artibody']//p//text()|//div[@id = 'article']//p//text()"
        ).extract()
        item['source'] = response.xpath(
            "//a[@class = 'source ent-source']/text()|//span[@class = 'source ent-source']/text()"
        ).extract()
        item['TID'] = None

        a = re.findall(r'http.{1,}sina', item['href'])[0][7:-5]
        a = a.replace(
            u"/",
            "",
        )

        if a in 'k':
            item['TID'] = re.findall(r'article_.{1,}_', item['href'])[0][8:-1]
        else:
            item['TID'] = re.findall(r'-ih.{1,}shtml', item['href'])[0][1:-6]

        if a in xw_type.cs:
            item['source'] = response.xpath(
                "//span[@id = 'art_source']/text()").extract()
            item['article'] = response.xpath(
                "//div[@class = 'article-body main-body']//p//text()").extract(
                )
        elif a in xw_type.ss:
            item['source'] = response.xpath(
                "//a[@class = 'source content-color']/text()|//span[@class ='source content-color']/text()"
            ).extract()
        elif a in xw_type.xw:
            item['article'] = response.xpath("//div[@id = 'article']").xpath(
                'string(.)').extract()
            item['source'] = response.xpath(
                "//a[@class = 'source']/text()").extract()
        elif a in xw_type.bk:
            item['source'] = '新浪博客'
            item['article'] = response.xpath(
                "//div[@id='sina_keyword_ad_area2']/div/font|//div[@id='sina_keyword_ad_area2']/p/font"
            ).xpath('string(.)').extract()

        # Mobile version of the site
        if len(item['article']) == 0 and len(item['source']) == 0:
            item['article'] = response.xpath(
                "//section[@class = 'art_pic_card art_content']/p//text()"
            ).extract()
            item['source'] = response.xpath(
                "//h2[@class ='weibo_user']/text()").extract()

        yield item
        article = Article(tid=item['TID'],
                          channel_id=3,
                          title=item['title_main'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])

        self.R.append(article)
        if len(self.r) == len(self.R):
            print(len(self.R))
            print('Starting database save')
            print('Crawling finished; starting heat analysis')
            SARunner().article_List(self.R)
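
The TID slicing in this example ([0][8:-1] and [0][1:-6]) can also be written with capture groups, which makes the intent of each pattern easier to read. A minimal sketch against made-up URLs of the two shapes the code distinguishes (the 'k.sina' dispatch below is a simplification for illustration, not the example's own check):

import re


def sina_tid(href):
    """Extract the article TID from the two URL shapes handled above."""
    if 'k.sina' in href:                      # 'article_..._...' style pages
        match = re.search(r'article_(.+)_', href)
    else:                                     # classic '...-ihxxxxxx.shtml' pages
        match = re.search(r'-(ih.+)\.shtml', href)
    return match.group(1) if match else None


if __name__ == '__main__':
    # Both URLs are made-up samples; only their shape matters here
    print(sina_tid('https://k.sina.com.cn/article_1234567890_abcdef.html'))
    print(sina_tid('https://news.sina.com.cn/c/2020-01-02/doc-ihabcdefg1234567.shtml'))
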
Example #12
 def updateOldArticleToArticleHistoryTable(self,
                                           articleList,
                                           currentTableName,
                                           historyTableName,
                                           isEventTable=False):
     '''
      Update old articles into the article history table
      @param currentTableName: current article table (the global, entity or entity-event article table)
      @param historyTableName: history article table (the global, entity or entity-event article table)
      @param isEventTable: True when writing to an entity-event article table (rows then carry an EVENT_ID)
     '''
     articleList = list(articleList)
     if len(articleList) > 0:
         if isEventTable is False:
             eventIdFieldName = ''
         else:
             eventIdFieldName = ',EVENT_ID'
          # Look up the articles that already exist in the current table
         selectSql = '''
         SELECT TID, CHANNEL_ID %s FROM %s where %s
         '''
         whereClauseList = map(
             lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
             (article.tid, article.channel_id), articleList)
         self.dbProxy.execute(selectSql %
                              (eventIdFieldName, currentTableName,
                               ' or '.join(whereClauseList)))
         resultList = self.dbProxy.fetchall()
         if isEventTable:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1], eventId=item[2]),
                 resultList)
         else:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1]), resultList)
         toBeUpdateArticleList = list()
         for item in existingArticleList:
             index = articleList.index(item)
             obj = copy.copy(articleList[index])
             obj.eventId = item.eventId
             toBeUpdateArticleList.append(obj)
         if len(toBeUpdateArticleList) > 0:
             n = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             if isEventTable is False:
                 eventIdFieldName = ''
             else:
                 eventIdFieldName = 'EVENT_ID,'
             insertSql = '''
             INSERT INTO %s (TID, %s CHANNEL_ID,
                 READ_COUNT,LIKE_COUNT, REPLY_COUNT,
                 FORWARD_COUNT, COLLECT_COUNT, HEAT, ADD_DATETIME)
             VALUES %s 
             '''
             valueList = list()
             for article in toBeUpdateArticleList:
                 statistics = article.statistics
                 if isEventTable is False:
                     eventIdFieldValue = ''
                 else:
                     eventIdFieldValue = str(article.eventId) + ','
                 valueList.append(
                     '("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s")' % (
                         article.tid,
                         eventIdFieldValue,
                         article.channel_id,
                         statistics.read_count if statistics.read_count
                         is not None else Constants.DEFAULT_NUM,
                         statistics.like_count if statistics.like_count
                         is not None else Constants.DEFAULT_NUM,
                         statistics.reply_count if statistics.reply_count
                         is not None else Constants.DEFAULT_NUM,
                         statistics.forward_count
                         if statistics.forward_count is not None else
                         Constants.DEFAULT_NUM,
                         statistics.collect_count
                         if statistics.collect_count is not None else
                         Constants.DEFAULT_NUM,
                         statistics.heat if statistics.heat is not None else
                         Constants.DEFAULT_NUM,
                         n,
                     ))
             if len(valueList) > 0:
                 self.dbProxy.execute(insertSql %
                                      (historyTableName, eventIdFieldName,
                                       ','.join(valueList)))
                 self.dbProxy.commit()
Example #13
 def updateOldArticleToArticleTable(self,
                                    articleList,
                                    tableName,
                                    isEventTable=False):
     '''
      Update old articles in the article table
      @param tableName: the global article table, an entity article table or an entity-event article table
      @param isEventTable: True when writing to an entity-event article table (rows then carry an EVENT_ID)
     '''
     articleList = list(articleList)
     if len(articleList) > 0:
         if isEventTable is False:
             eventIdFieldName = ''
         else:
             eventIdFieldName = ',EVENT_ID'
          # Look up the articles that already exist in the table
         selectSql = '''
         SELECT TID, CHANNEL_ID %s FROM %s where %s
         '''
         whereClauseList = map(
             lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
             (article.tid, article.channel_id), articleList)
         self.dbProxy.execute(
             selectSql %
             (eventIdFieldName, tableName, ' or '.join(whereClauseList)))
         resultList = self.dbProxy.fetchall()
         if isEventTable:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1], eventId=item[2]),
                 resultList)
         else:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1]), resultList)
         toBeUpdateArticleList = list()
         for item in existingArticleList:
              index = articleList.index(item)  # index of the matching article in the input list
             obj = copy.deepcopy(articleList[index])
             obj.eventId = item.eventId
             toBeUpdateArticleList.append(obj)
         if len(toBeUpdateArticleList) > 0:
             n = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             if isEventTable is False:
                 eventIdFieldName = ''
             else:
                 eventIdFieldName = 'EVENT_ID,'
             insertSql = '''
             INSERT INTO %s (TID, %s CHANNEL_ID,
                 READ_COUNT,LIKE_COUNT, REPLY_COUNT,
                 FORWARD_COUNT, COLLECT_COUNT, HEAT, UPDATE_DATETIME,PUBLISH_DATE)
             VALUES %s 
             ON DUPLICATE KEY UPDATE READ_COUNT=VALUES(READ_COUNT), LIKE_COUNT=VALUES(LIKE_COUNT), 
             REPLY_COUNT = VALUES(REPLY_COUNT), FORWARD_COUNT=VALUES(FORWARD_COUNT), 
             COLLECT_COUNT = VALUES(COLLECT_COUNT), HEAT = VALUES(HEAT), UPDATE_DATETIME=VALUES(UPDATE_DATETIME)
             '''
             valueList = list()
             for article in toBeUpdateArticleList:
                 statistics = article.statistics
                  data = article.publish_datetime
                  data = data.strftime('%Y-%m-%d') if data is not None else None
                 if isEventTable is False:
                     eventIdFieldValue = ''
                 else:
                     eventIdFieldValue = str(article.eventId) + ','
                 valueList.append(
                     '("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s","%s")' % (
                         article.tid,
                         eventIdFieldValue,
                         article.channel_id,
                         statistics.read_count if statistics.read_count
                         is not None else Constants.DEFAULT_NUM,
                         statistics.like_count if statistics.like_count
                         is not None else Constants.DEFAULT_NUM,
                         statistics.reply_count if statistics.reply_count
                         is not None else Constants.DEFAULT_NUM,
                         statistics.forward_count
                         if statistics.forward_count is not None else
                         Constants.DEFAULT_NUM,
                         statistics.collect_count
                         if statistics.collect_count is not None else
                         Constants.DEFAULT_NUM,
                         statistics.heat if statistics.heat is not None else
                         Constants.DEFAULT_NUM,
                         n,
                         data if data is not None else Constants.DATE,
                     ))
             if len(valueList) > 0:
                 sql = insertSql % (tableName, eventIdFieldName,
                                    ','.join(valueList))
                 self.dbProxy.execute(sql)
                 self.dbProxy.commit()
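
The hand-assembled VALUES string in the last two examples can also be fed through executemany, with one parameter tuple per article, so the driver handles quoting. A minimal sketch assuming a MySQL-style DB-API driver such as PyMySQL, covering only the non-event table layout (no EVENT_ID column), with column names copied from the INSERT above and DEFAULT_NUM standing in for Constants.DEFAULT_NUM:

from datetime import datetime

DEFAULT_NUM = 0

UPSERT_SQL = '''
INSERT INTO {table} (TID, CHANNEL_ID, READ_COUNT, LIKE_COUNT, REPLY_COUNT,
    FORWARD_COUNT, COLLECT_COUNT, HEAT, UPDATE_DATETIME, PUBLISH_DATE)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE READ_COUNT=VALUES(READ_COUNT), LIKE_COUNT=VALUES(LIKE_COUNT),
    REPLY_COUNT=VALUES(REPLY_COUNT), FORWARD_COUNT=VALUES(FORWARD_COUNT),
    COLLECT_COUNT=VALUES(COLLECT_COUNT), HEAT=VALUES(HEAT),
    UPDATE_DATETIME=VALUES(UPDATE_DATETIME)
'''


def _num(value):
    # Fall back to the default when a counter is missing
    return value if value is not None else DEFAULT_NUM


def article_row(article, now):
    """Turn one Article into a parameter tuple for UPSERT_SQL."""
    s = article.statistics
    pub = (article.publish_datetime.strftime('%Y-%m-%d')
           if article.publish_datetime is not None else None)
    return (article.tid, article.channel_id, _num(s.read_count),
            _num(s.like_count), _num(s.reply_count), _num(s.forward_count),
            _num(s.collect_count), _num(s.heat), now, pub)


def upsert_article_stats(cursor, table_name, articles):
    """Batch-upsert article statistics; values travel as parameters, not as text."""
    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    rows = [article_row(a, now) for a in articles]
    if rows:
        # Table names cannot be parameterized, so the name is still formatted in,
        # mirroring the original code's use of % for the table name
        cursor.executemany(UPSERT_SQL.format(table=table_name), rows)
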