def fetchOldArticleList(self, channel, articleList, articleCount=100):
    '''
    Fetch, from the global article table, articles that are not yet extinct
    and that are not among the records brought back by the current crawl.
    '''
    channel = int(channel)
    # Count query, used to work out how many pages of old articles exist.
    selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
    sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE, channel)
    # Query that fetches the old articles themselves.
    selectSql = ('SELECT TID, title, publish_datetime, url, meta_info, '
                 'like_count, reply_count, forward_count '
                 'FROM %s where extinct="N" and channel_id=%d ')
    sql = selectSql % (Constants.TABLE_SA_ARTICLE, channel)
    if len(articleList) > 0:
        # Exclude everything the current crawl already returned.
        whereClauseList = ' and '.join(
            ' tid<>"%s" ' % article.tid for article in articleList)
        sql += ' and (%s)' % whereClauseList
        sql2 += ' and (%s)' % whereClauseList
    sql2 += ' order by add_datetime desc;'
    self.dbProxy.execute(sql2)
    countResult = self.dbProxy.fetchall()  # e.g. ((53,),)
    totalCount = int(countResult[0][0])
    if totalCount > int(articleCount):
        # Randomly pick one page of the result set.
        pageCount = int(math.ceil(float(totalCount) / articleCount))
        randpage = random.randint(0, pageCount - 1)
    else:
        randpage = 0
    # LIMIT takes a row offset, i.e. page index times page size.
    sql += ' order by add_datetime desc limit %d,%d' % (randpage * articleCount,
                                                        articleCount)
    self.dbProxy.execute(sql)
    resultList = self.dbProxy.fetchall()
    L1 = []
    for item in resultList:
        result = Article(item[0], channel,
                         title=item[1],
                         publish_datetime=item[2],
                         url=item[3],
                         meta_info=item[4])
        result.statistics = ArticleStatistics(item[0], channel,
                                              like_count=item[5],
                                              reply_count=item[6],
                                              forward_count=item[7])
        L1.append(result)
    return L1
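# A minimal, standalone sketch of the paging arithmetic assumed above (the names
# below are illustrative only and not part of the project): MySQL's LIMIT takes a
# row offset, so a randomly chosen page index must be multiplied by the page size
# before it can be used as that offset.
import math
import random

def random_page_offset(total_count, page_size):
    """Return a row offset for a randomly chosen page of `page_size` rows."""
    if total_count <= page_size:
        return 0
    page_count = int(math.ceil(float(total_count) / page_size))
    page_index = random.randint(0, page_count - 1)  # last page may be partial
    return page_index * page_size

# e.g. random_page_offset(530, 100) -> one of 0, 100, 200, 300, 400, 500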
def parse_main(self, response):
    item = response.meta['item']
    # Article body: the selectors cover the different page templates used by the site.
    item['article'] = response.xpath(
        "//div[@class ='p-right left']//div[@id='p-detail']//p|"
        "//div[@id='content']//p|"
        "//div[@class='content']//p|"
        "//div[@class ='contant clearfix']/div[@class ='xl']//p|"
        "//div[@id ='Content']//p|"
        "//div[@class ='zj_left']/div[@class ='zj_nr']//p|"
        "//td[@class='text_con_16_33']//p|"
        "//div[@class ='content pack']//p|"
        "//div[@class = 'article']//p|"
        "//div[@class ='main-content-box']//p|"
        "//div[@id ='nr_wz']//p").xpath('string(.)').extract()
    item['TID'] = re.findall(r'c_.{1,}htm', item['href'])[0][2:-4]
    yield item
    article = Article(tid=item['TID'],
                      channel_id=11,
                      title=item['title'],
                      content=item['article'],
                      publish_datetime=item['time'],
                      url=item['href'],
                      author_name=item['source'],
                      digest=item['intro'])
    self.r.append(article)
    if len(self.r) == len(self.R):
        print(len(self.r))
        print('Crawling finished; starting heat analysis')
        SARunner().article_List(self.r)
def analysisBefore(self, event_id):
    entityEventDict = dict({'event_id': event_id})
    if entityEventDict is None or self.entity_id not in entityEventDict:
        self.logger.warn('Entity %d not found in system', self.entity_id)
        return []
    eventDict = entityEventDict[self.entity_id]
    articleTableName = (Constants.TABLE_SA_ARTICLE +
                        Constants.TABLE_NAME_DELIMITER + str(self.entity_id))
    eventTableName = (Constants.TABLE_SA_EVENT +
                      Constants.TABLE_NAME_DELIMITER + str(self.entity_id))
    # Articles published after the event started.
    sqlArticleListBefore = '''
        SELECT a.TID, a.CHANNEL_ID, a.TITLE, a.CONTENT, a.PUBLISH_DATETIME,
               a.URL, a.AUTHOR_ID, a.AUTHOR_NAME, a.PUBLISH_METHOD, a.DIGEST, a.HEAT
        FROM %s as a, %s as e
        WHERE a.PUBLISH_DATETIME > e.START_DATETIME
    ''' % (articleTableName, eventTableName)
    self.dbProxy.execute(sqlArticleListBefore)
    resultList = self.dbProxy.fetchall()
    articleList = [Article(item[0], item[1], item[2], item[3], item[4], item[5],
                           item[6], item[7], item[8], item[9], item[10])
                   for item in resultList]
    hitEventList = list()
    for article in articleList:
        hitEventList.append(self.__analysisEvent(article, eventDict))
    return hitEventList
def __fetchAllArticleList(self, entity_id, start_time, end_time):
    '''
    Fetch the articles added to the database within the given time range.
    '''
    selectSql = '''select * from %s where add_datetime between '%s' and '%s' '''
    tableName = (Constants.TABLE_SA_ARTICLE +
                 Constants.TABLE_NAME_DELIMITER + entity_id)
    sql = selectSql % (tableName, start_time, end_time)
    self.dbProxy.execute(sql)
    results = self.dbProxy.fetchall()
    self.logger.debug(len(results))
    articleList = list()
    for item in results:
        article = Article(tid=item[0],
                          url=item[1],
                          add_datetime=item[2],
                          publish_datetime=item[3],
                          publish_method=item[4],
                          title=item[7],
                          author_id=item[8],
                          author_name=item[9],
                          content=item[11],
                          heat=item[17],
                          channel_id=item[18],
                          entity=entity_id)
        article.statistics.read_count = item[12]
        article.statistics.like_count = item[13]
        article.statistics.reply_count = item[14]
        article.statistics.forward_count = item[15]
        article.statistics.collect_count = item[16]
        articleList.append(article)
    return articleList
def parse_main(self, response):
    item = RMWspider1Item()
    item['title'] = response.meta['title'][0]
    item['time'] = response.meta['time']
    item['intro'] = response.meta['intro'][0].replace('[', '', 1).replace(']', '')
    item['href'] = response.meta['href']
    item['TID'] = re.findall(r'/c.{1,}html', item['href'])[0][1:-5]
    if 'people' in item['TID']:
        item['TID'] = re.findall(r'/c.{1,}', item['TID'])[0][1:]
    # Source / author: the selectors cover the different article templates.
    item['source'] = response.xpath(
        "//div[@class = 'artOri']/a/text()|"
        "//div[@class='box01']//a/text()|"
        "//div[@class='text_c']/p//a/text()|"
        "//div[@class = 'msgBox']//a/text()|"
        "//div[@class = 'page_c']/div[@class = 'fr']/a/text()|"
        "//div[@class = 'w1000 p2']//a/text()|"
        "//div[@class = 'p2j_text fl']/h2/a/text()").extract_first()
    item['article'] = response.xpath(
        "//div[@id='rwb_zw']//p|"
        "//div[@class='show_text']//p|"
        "//div[@class='artDet']//p|"
        "//div[@class='text_con clearfix']//p|"
        "//div[@class = 'content clear clearfix']//p|"
        "//div[@id = 'p_content']//p|"
        "//div[@class = 'box_con']//p|"
        "//div[@class = 'text_show']//p|"
        "//div[@class = 'gray box_text']//p|"
        "//div[@class = 'text_box clearfix']//p").xpath('string(.)').extract()
    yield item
    article = Article(tid=item['TID'],
                      channel_id=5,
                      title=item['title'],
                      content=item['article'],
                      publish_datetime=item['time'],
                      url=item['href'],
                      author_name=item['source'],
                      digest=item['intro'])
    self.r.append(article)
    if len(self.R) == len(self.r):
        print(len(self.r))
        print('Crawling finished; starting heat analysis')
        SARunner().article_List(self.r)
def filterRemovedArticle(self, articleList, entityId, eventId=None):
    '''
    Filter the articles against the "removed" table and return only
    those that are not present in it.
    '''
    if len(articleList) == 0:
        return []
    if eventId is not None:
        tableName = (Constants.TABLE_SA_EVENT_ARTICLE_REMOVE +
                     Constants.TABLE_NAME_DELIMITER + entityId)
        eventCondition = ' event_id=%d and ' % eventId
        start_datetime, end_datetime = self.fetchEventTime(entityId, eventId)
        # Drop articles published outside the event's start/end window.
        article_new_list = list()
        for article in articleList:
            if (str(article.publish_datetime) > str(start_datetime)) and (
                    str(article.publish_datetime) < str(end_datetime)):
                article_new_list.append(article)
        articleList = article_new_list
        if len(articleList) == 0:
            # Nothing left to check against the remove table.
            return []
    else:
        tableName = (Constants.TABLE_SA_ARTICLE_REMOVE +
                     Constants.TABLE_NAME_DELIMITER + entityId)
        eventCondition = ''
    # Look the articles up in the remove table.
    selectSql = '''
        SELECT TID, CHANNEL_ID FROM %s where %s (%s)
    '''
    whereClauseList = map(
        lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
        (article.tid, article.channel_id), articleList)
    self.dbProxy.execute(
        selectSql % (tableName, eventCondition, ' or '.join(whereClauseList)))
    resultList = self.dbProxy.fetchall()
    # Materialise the result so membership tests can run more than once.
    removedArticleList = [Article(x[0], x[1]) for x in resultList]
    filteredArticle = [x for x in articleList if x not in removedArticleList]
    return filteredArticle
def seperateNewOldArticles(self, articleList, entityId=None):
    '''
    Query the global article table and split the input into
    already-stored (old) articles and new articles.
    '''
    if len(articleList) == 0:
        return ([], [])
    if entityId is None:
        selectSql = 'select tid, channel_id from %s where ' % Constants.TABLE_SA_ARTICLE
    else:
        selectSql = 'select tid, channel_id from %s where ' % (
            Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER + entityId)
    whereClauseList = map(
        lambda article: '(tid="%s" and channel_id=%d)' %
        (article.tid, article.channel_id), articleList)
    self.dbProxy.execute(selectSql + ' or '.join(whereClauseList))
    # Materialise the result so it can be scanned once per input article.
    resultList = [Article(x[0], x[1]) for x in self.dbProxy.fetchall()]
    existingArticleList = [x for x in articleList if x in resultList]
    newArticleList = [x for x in articleList if x not in resultList]
    return (existingArticleList, newArticleList)
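# A hedged alternative sketch of the same new/old split, under the assumption
# (implied by the SQL above) that an article is uniquely identified by the
# (tid, channel_id) pair. A set lookup avoids scanning the query result once per
# input article; `Article` here refers to the project's own class and is only
# consumed, not redefined.
def separate_by_key(articleList, existingRows):
    """existingRows: iterable of (tid, channel_id) tuples returned by the lookup query."""
    existingKeys = {(str(tid), int(channel_id)) for tid, channel_id in existingRows}
    existing = [a for a in articleList
                if (str(a.tid), int(a.channel_id)) in existingKeys]
    new = [a for a in articleList
           if (str(a.tid), int(a.channel_id)) not in existingKeys]
    return existing, new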
def crawlNewsArticle(self, url):
    '''
    Crawl articles whose url prefix is news.qq.com or gd.qq.com.
    :param url:
    :return:
    '''
    html = self.session.download(url, encoding='gbk', data=None, timeout=10,
                                 retry=3, addr=True)
    if html:
        article_url = html['url']
        if article_url.find('news.qq.com') < 0 and article_url.find('gd.qq.com') < 0:
            self.logger.warn('Unrelated url found:%s', url)
            return None
        article_url = re.findall(
            r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
        self.logger.debug('[TencentNews]' + article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')
        main = soup.find('div', attrs={'id': "Main-Article-QQ"})
        main1 = soup.find('div', attrs={'id': "Main-P-QQ"})
        if main is not None:
            Ttitle = main.find('h1').text.strip()  # title
            # Publish time: the templates use different class names.
            Ttime = main.find('span', attrs={'class': "article-time"})
            Ttime1 = main.find('span', attrs={'class': "a_time"})
            Ttime2 = main.find('span', attrs={'class': "pubTime"})
            if Ttime is not None:
                Ttime = Ttime.text.strip()
            elif Ttime1 is not None:
                Ttime = Ttime1.text.strip()
            elif Ttime2 is not None:
                Ttime = Ttime2.text.strip()
            else:
                Ttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if len(Ttime) == 16:
                Ttime = Ttime + ':00'
            # Source / author.
            Tauthor = main.find('span', attrs={'class': "a_source"})
            Tauthor1 = main.find('span', attrs={'class': "color-a-1"})
            if Tauthor is not None:
                Tauthor = Tauthor.text.strip()
            elif Tauthor1 is not None:
                Tauthor = Tauthor1.text.strip()
            else:
                Tauthor = None
            Tcontent = main.find('div', attrs={'id': "Cnt-Main-Article-QQ"})
            if Tcontent is not None:
                Tcontent = Tcontent.text.strip()
                Tcontent = re.sub(r'\n|\t', '', Tcontent)
            else:
                Tcontent = None
            articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
            try:
                commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0]
                meta_info = '{"commentid":"%s"}' % commentid
            except Exception:
                commentid = None
                meta_info = None
            article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent,
                              Ttime, article_url, None, Tauthor,
                              meta_info=meta_info)
            if commentid is not None:
                try:
                    re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum'
                    html1 = json.loads(
                        self.session.download(re_url, encoding='utf-8', data=None,
                                              timeout=10, retry=3))
                    Treply = int(html1['data']['commentnum'])
                except Exception:
                    traceInfo = traceback.format_exc()
                    self.logger.error('Failed to parse comment for %s (cid=%s):%s',
                                      articleid, commentid, traceInfo)
                    Treply = None
                article.statistics.reply_count = Treply
            return article
        elif main1 is not None:
            Ttitle = soup.find('meta', attrs={'name': "Description"}).attrs['content']  # title
            Ttime = re.findall(
                r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',",
                html['html'])
            if Ttime:  # findall returns a (possibly empty) list
                Ttime = Ttime[0]
                Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
            else:
                Ttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if len(Ttime) == 16:
                Ttime = Ttime + ':00'
            Tauthor = re.findall(r'para = {\s+name: \"(.*)\",', html['html'])
            if Tauthor:
                Tauthor = Tauthor[0]
            else:
                Tauthor = None
            # Picture-style pages keep their paragraphs in a companion .hdBigPic.js file.
            con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url)
            con_html = self.session.download(con_url, encoding='gbk', data=None,
                                             timeout=10, retry=3)
            con_list = re.findall(r'<p>(.*?)</p>', con_html)
            if con_list:
                TT = []
                for i in con_list:
                    if i.strip() not in TT:
                        TT.append(i)
                Tcontent = ''.join(TT)
            else:
                Tcontent = None
            articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
            try:
                commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0]
                meta_info = '{"commentid":"%s"}' % commentid
            except Exception:
                commentid = None
                meta_info = None
            article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent,
                              Ttime, article_url, None, Tauthor,
                              meta_info=meta_info)
            try:
                if commentid is not None:
                    re_url = 'http://coral.qq.com/article/batchcommentnum'
                    data1 = {'targetid': articleid}
                    html1 = json.loads(
                        self.session.download(re_url, encoding='utf-8', data=data1,
                                              timeout=10, retry=3))
                    Treply = int(html1['data'][0]['commentnum'])
                else:
                    Treply = None
            except Exception:
                Treply = None
            article.statistics.reply_count = Treply
            return article
    return None
def crawlNewArticle(self, url):
    '''
    Crawl articles whose url prefix is new.qq.com.
    :param url:
    :return:
    '''
    html = self.session.download(url, encoding='gbk', data=None, timeout=10,
                                 retry=3, addr=True)
    if html:
        article_url = html['url']
        if article_url.find('new.qq.com/omn') < 0:
            self.logger.warn('Unrelated url found:%s', url)
            return
        article_url = re.findall(
            r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
        self.logger.debug('[TencentNew]' + article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')
        # The article metadata is embedded as JSON in `window.DATA` inside a <script> tag.
        script_tags = soup.head.find_all('script')
        data = dict()
        for tag in script_tags:
            text = re.search(r'window.DATA = (.*)', tag.text, re.S)
            if text:
                data = json.loads(text.group(1))
        tid = data['article_id']
        title = data['title']
        author_name = data['media']
        author_id = data['media_id']
        publish_datetime = data['pubtime']
        comment_id = data['comment_id']
        main = soup.find('div', attrs={'class': 'qq_conent clearfix'})
        t_content = ''
        if main is not None:
            contents = main.find_all('p', {'class': 'one-p'})
            for content in contents:
                if content.string is None:
                    continue
                t_content += str(content.get_text().strip())
        # The comment count comes back as JSONP: _article<id>commentnum({...}).
        get_comment_count_url = (
            'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum'
            % (comment_id, comment_id))
        comment_data = self.session.download(get_comment_count_url)
        comment_data = re.search(
            r'_article%scommentnum\((.*)\)' % comment_id, comment_data)
        # Parse the JSON payload instead of eval-ing downloaded text.
        comment_dict = json.loads(comment_data.group(1))
        reply_count = comment_dict['data']['commentnum']
        meta_info = '{"commentid":"%s"}' % comment_id
        article = Article(tid=tid,
                          channel_id=self.channel.channel_id,
                          title=title,
                          content=t_content,
                          publish_datetime=publish_datetime,
                          url=article_url,
                          author_id=author_id,
                          author_name=author_name,
                          meta_info=meta_info)
        article.statistics.reply_count = reply_count
        return article
    return None
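# A small standalone sketch of the JSONP unwrapping used above: the comment endpoint
# returns `_article<id>commentnum({...json...})`, so the payload is recovered by
# stripping the callback wrapper and parsing the remainder as JSON. The URL format and
# field names are taken from the code above and are not verified independently.
import json
import re

def parse_jsonp_comment_num(raw, comment_id):
    """Extract data.commentnum from a `_article<id>commentnum(...)` JSONP response."""
    match = re.search(r'_article%scommentnum\((.*)\)' % comment_id, raw, re.S)
    if not match:
        return None
    payload = json.loads(match.group(1))
    return payload.get('data', {}).get('commentnum')

# e.g. parse_jsonp_comment_num('_article123commentnum({"data":{"commentnum":42}})', '123') -> 42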
def parse_info(self, response):
    weibo_list = response.xpath("//div[@class='c' and @id]")
    for weibo in weibo_list:
        item = Weibospider1Item()
        div = weibo.xpath("./div")
        if len(div) == 1:
            # Post type: original post without images.
            item["category"] = "无图原创"
            item["author"] = weibo.xpath("./div/a[@class='nk']/text()").extract_first()
            item['author_id'] = weibo.xpath("./div[1]/a[@class='nk']/@href").extract_first()
            item["content"] = weibo.xpath("./div/span[@class='ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div/text()|./div/span[@class='ctt']//text()").extract()
            item["dianzan"] = weibo.xpath("./div/a/text()").extract()[-4]
            item["relay"] = weibo.xpath("./div/a/text()").extract()[-3]
            item["comment"] = weibo.xpath("./div/a[@class='cc']/text()").extract_first()
            item["comment_url"] = weibo.xpath("./div/a[@class='cc']/@href").extract_first()
            item["send_time"] = weibo.xpath("./div/span[@class='ct']/text()").extract_first()
            item["reason"] = None
            item["img_url"] = None
            item['reason_name'] = None
            item['reason_id'] = None
        elif len(div) == 2:
            item["category"] = ""
            item["content"] = weibo.xpath("./div[1]/span[@class='ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div[1]/text()|./div[1]/span[@class='ctt']//text()").extract()
            item["relay"] = weibo.xpath("./div[2]/a/text()").extract()[-3]
            item["comment"] = weibo.xpath("./div[2]/a[@class='cc']/text()").extract_first()
            item["reason"] = None
            img = weibo.xpath("./div[2]//img[@class='ib']/@src")
            if len(img) == 0:
                # Repost without images.
                item['category'] = "无图转发"
                item["author"] = weibo.xpath("./div/span[@class = 'cmt']/a/text()").extract_first()
                item['author_id'] = weibo.xpath("./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
                item['reason_id'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
                item["dianzan"] = weibo.xpath("./div[2]/a/text()").extract()[-4]
                item["reason"] = weibo.xpath(
                    "./div[2]/text()|./div[2]//span[@class='kt']/text()").extract()
                item["comment_url"] = weibo.xpath("./div[2]/a[@class='cc']/@href").extract_first()
                item["img_url"] = None
                item["send_time"] = weibo.xpath("./div[2]/span[@class='ct']/text()").extract_first()
            else:
                # Original post with images.
                item['category'] = "有图原创"
                item["author"] = weibo.xpath("./div/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath("./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = None
                item['reason_id'] = None
                item["dianzan"] = weibo.xpath("./div[2]/a/text()").extract()[-4]
                item["img_url"] = weibo.xpath("./div[2]//img[@class='ib']/@src").extract_first()
                item["comment_url"] = weibo.xpath("./div[2]/a[@class='cc']/@href").extract_first()
                item["send_time"] = weibo.xpath("./div[2]/span[@class='ct']/text()").extract_first()
        else:  # len(div) == 3
            # Repost with images.
            item["category"] = "带图片转发"
            item["author"] = weibo.xpath("./div[1]/a[@class='nk']/text()").extract_first()
            item['author_id'] = weibo.xpath("./div[1]/a[@class='nk']/@href").extract_first()
            item['reason_name'] = weibo.xpath(
                "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
            item['reason_id'] = weibo.xpath(
                "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
            item["content"] = weibo.xpath("./div[1]/span[@class = 'ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div[1]/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div[1]/text()|./div[1]/span[@class='ctt']//text()").extract()
            item["send_time"] = weibo.xpath("./div[3]/span[@class='ct']/text()").extract_first()
            item["dianzan"] = weibo.xpath("./div[3]/a/text()").extract()[-4]
            item["relay"] = weibo.xpath("./div[3]/a/text()").extract()[-3]
            item["comment"] = weibo.xpath("./div[3]/a[@class='cc']/text()").extract_first()
            item["comment_url"] = weibo.xpath("./div[3]/a[@class='cc']/@href").extract_first()
            item["img_url"] = weibo.xpath("./div[2]//img[@class='ib']/@src").extract_first()
            item["reason"] = weibo.xpath(
                "./div[3]/text()|./div[3]//span[@class='kt']/text()").extract()
        item['relay_url'] = ''
        item['TID'] = re.findall(r'uid=.{1,}&', item["comment_url"])[0][4:-1]
        a = weibo.xpath("//a[@class='nk']/@href").extract()
        yield item
        article = Article(tid=item['TID'],
                          channel_id=9,
                          content=item['content'],
                          publish_datetime=item['send_time'],
                          url=item['comment_url'],
                          title=item['content'][0:100],
                          author_id=item['author_id'],
                          author_name=item['author'])
        article.statistics = ArticleStatistics(
            tid=item['TID'],
            channel_id=9,
            reply_count=item['comment'],
            forward_count=item['relay'],
            like_count=item['dianzan'],
        )
        if int(item['relay']) > 0:
            self.relay_url_list.append(item['relay_url'])
        self.r.append(article)
        self.name_url_list.append(a)
    # Pagination: "current page / total pages" lives in the pagelist form.
    num_page = response.xpath("//div[@id='pagelist']/form/div/text()").extract()
    num_page = [i.replace(u"\xa0", "") for i in num_page]
    num_page = [i for i in num_page if len(i) > 0][0]
    num_page = re.findall(r'\d+', num_page)
    print('Crawling page', num_page[0], 'of', num_page[1])
    max_page = NUM_PAGE
    if max_page is None:
        max_page = int(num_page[1])
    if int(num_page[0]) == max_page:
        L = []
        for L1 in self.name_url_list:
            L += L1
        for url_1 in L:
            with open(os_file.a + '\\crawler_url.txt', 'a', encoding='utf-8') as f:
                f.write(url_1 + "\n")
        print('Page limit reached; search-page crawling finished')
        print('Crawling finished; starting heat analysis')
        SARunner().article_List(self.r)
        print('Number of weibo posts crawled:', len(self.r))
        # Crawl the author detail pages (avatar, id, follow and fan counts).
        with open(os_file.a + '\\crawler_url.txt', 'r', encoding='utf-8') as f:
            urls = f.readlines()
        # De-duplicate the user urls before scheduling requests.
        L2 = {}.fromkeys(urls).keys()
        self.L2 = len(L2)
        print('Start crawling user detail pages;', self.L2, 'unique users in total')
        for url in L2:
            yield scrapy.FormRequest(url=url,
                                     callback=self.parse_info_detail,
                                     dont_filter=True)
    else:
        next_url = response.xpath("//a[text() = '下页']/@href").extract_first()
        next_url = urllib.parse.urljoin(response.url, next_url)
        yield scrapy.Request(next_url, callback=self.parse_info, dont_filter=True)
def parse_main(self, response):
    item = XinLangspider1Item()
    item['intro'] = str(response.meta["intro"]).replace(u"...", "").replace(
        u"']", "").replace(u"['", "")
    item['href'] = response.meta["href"]
    item['time'] = response.meta['time']
    item['title_main'] = response.meta['title']
    item['article'] = response.xpath(
        "//div[@id = 'artibody']//p//text()|//div[@id = 'article']//p//text()"
    ).extract()
    item['source'] = response.xpath(
        "//a[@class = 'source ent-source']/text()|//span[@class = 'source ent-source']/text()"
    ).extract()
    item['TID'] = None
    # The subdomain (e.g. 'k', 'news', 'blog') decides which TID pattern and selectors apply.
    a = re.findall(r'http.{1,}sina', item['href'])[0][7:-5]
    a = a.replace(u"/", "")
    if a == 'k':
        item['TID'] = re.findall(r'article_.{1,}_', item['href'])[0][8:-1]
    else:
        item['TID'] = re.findall(r'-ih.{1,}shtml', item['href'])[0][1:-6]
    if a in xw_type.cs:
        item['source'] = response.xpath("//span[@id = 'art_source']/text()").extract()
        item['article'] = response.xpath(
            "//div[@class = 'article-body main-body']//p//text()").extract()
    elif a in xw_type.ss:
        item['source'] = response.xpath(
            "//a[@class = 'source content-color']/text()|//span[@class ='source content-color']/text()"
        ).extract()
    elif a in xw_type.xw:
        item['article'] = response.xpath("//div[@id = 'article']").xpath('string(.)').extract()
        item['source'] = response.xpath("//a[@class = 'source']/text()").extract()
    elif a in xw_type.bk:
        item['source'] = '新浪博客'
        item['article'] = response.xpath(
            "//div[@id='sina_keyword_ad_area2']/div/font|//div[@id='sina_keyword_ad_area2']/p/font"
        ).xpath('string(.)').extract()
    # Mobile version of the site.
    if len(item['article']) == 0 and len(item['source']) == 0:
        item['article'] = response.xpath(
            "//section[@class = 'art_pic_card art_content']/p//text()").extract()
        item['source'] = response.xpath("//h2[@class ='weibo_user']/text()").extract()
    yield item
    article = Article(tid=item['TID'],
                      channel_id=3,
                      title=item['title_main'],
                      content=item['article'],
                      publish_datetime=item['time'],
                      url=item['href'],
                      author_name=item['source'],
                      digest=item['intro'])
    self.R.append(article)
    if len(self.r) == len(self.R):
        print(len(self.R))
        print('Saving to database')
        print('Crawling finished; starting heat analysis')
        SARunner().article_List(self.R)
def updateOldArticleToArticleHistoryTable(self, articleList, currentTableName,
                                          historyTableName, isEventTable=False):
    '''
    Write old articles into the article history table.
    @param currentTableName: current article table (global, per-entity or per-entity-event)
    @param historyTableName: history article table (global, per-entity or per-entity-event)
    @param isEventTable: True when the tables are entity-event article tables,
                         in which case the EVENT_ID column is carried over as well
    '''
    articleList = list(articleList)
    if len(articleList) > 0:
        if isEventTable is False:
            eventIdFieldName = ''
        else:
            eventIdFieldName = ',EVENT_ID'
        # Look up the articles that already exist in the current table.
        selectSql = '''
            SELECT TID, CHANNEL_ID %s FROM %s where %s
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)
        self.dbProxy.execute(selectSql % (eventIdFieldName, currentTableName,
                                          ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()
        if isEventTable:
            existingArticleList = [Article(item[0], item[1], eventId=item[2])
                                   for item in resultList]
        else:
            existingArticleList = [Article(item[0], item[1])
                                   for item in resultList]
        toBeUpdateArticleList = list()
        for item in existingArticleList:
            index = articleList.index(item)
            obj = copy.copy(articleList[index])
            obj.eventId = item.eventId
            toBeUpdateArticleList.append(obj)
        if len(toBeUpdateArticleList) > 0:
            n = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if isEventTable is False:
                eventIdFieldName = ''
            else:
                eventIdFieldName = 'EVENT_ID,'
            insertSql = '''
                INSERT INTO %s (TID, %s CHANNEL_ID, READ_COUNT, LIKE_COUNT,
                                REPLY_COUNT, FORWARD_COUNT, COLLECT_COUNT,
                                HEAT, ADD_DATETIME)
                VALUES %s
            '''
            valueList = list()
            for article in toBeUpdateArticleList:
                statistics = article.statistics
                if isEventTable is False:
                    eventIdFieldValue = ''
                else:
                    eventIdFieldValue = str(article.eventId) + ','
                valueList.append(
                    '("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s")' % (
                        article.tid,
                        eventIdFieldValue,
                        article.channel_id,
                        statistics.read_count if statistics.read_count is not None else Constants.DEFAULT_NUM,
                        statistics.like_count if statistics.like_count is not None else Constants.DEFAULT_NUM,
                        statistics.reply_count if statistics.reply_count is not None else Constants.DEFAULT_NUM,
                        statistics.forward_count if statistics.forward_count is not None else Constants.DEFAULT_NUM,
                        statistics.collect_count if statistics.collect_count is not None else Constants.DEFAULT_NUM,
                        statistics.heat if statistics.heat is not None else Constants.DEFAULT_NUM,
                        n,
                    ))
            if len(valueList) > 0:
                self.dbProxy.execute(insertSql % (historyTableName,
                                                  eventIdFieldName,
                                                  ','.join(valueList)))
                self.dbProxy.commit()
def updateOldArticleToArticleTable(self, articleList, tableName, isEventTable=False):
    '''
    Update old articles in the article table.
    @param tableName: global article table, per-entity article table or per-entity-event article table
    @param isEventTable: True when the table is an entity-event article table,
                         in which case the EVENT_ID column is carried over as well
    '''
    articleList = list(articleList)
    if len(articleList) > 0:
        if isEventTable is False:
            eventIdFieldName = ''
        else:
            eventIdFieldName = ',EVENT_ID'
        # Look up the articles that already exist in the table.
        selectSql = '''
            SELECT TID, CHANNEL_ID %s FROM %s where %s
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)
        self.dbProxy.execute(
            selectSql % (eventIdFieldName, tableName, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()
        if isEventTable:
            existingArticleList = [Article(item[0], item[1], eventId=item[2])
                                   for item in resultList]
        else:
            existingArticleList = [Article(item[0], item[1])
                                   for item in resultList]
        toBeUpdateArticleList = list()
        for item in existingArticleList:
            index = articleList.index(item)  # position of the matching input article
            obj = copy.deepcopy(articleList[index])
            obj.eventId = item.eventId
            toBeUpdateArticleList.append(obj)
        if len(toBeUpdateArticleList) > 0:
            n = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if isEventTable is False:
                eventIdFieldName = ''
            else:
                eventIdFieldName = 'EVENT_ID,'
            insertSql = '''
                INSERT INTO %s (TID, %s CHANNEL_ID, READ_COUNT, LIKE_COUNT,
                                REPLY_COUNT, FORWARD_COUNT, COLLECT_COUNT, HEAT,
                                UPDATE_DATETIME, PUBLISH_DATE)
                VALUES %s
                ON DUPLICATE KEY UPDATE
                    READ_COUNT=VALUES(READ_COUNT),
                    LIKE_COUNT=VALUES(LIKE_COUNT),
                    REPLY_COUNT=VALUES(REPLY_COUNT),
                    FORWARD_COUNT=VALUES(FORWARD_COUNT),
                    COLLECT_COUNT=VALUES(COLLECT_COUNT),
                    HEAT=VALUES(HEAT),
                    UPDATE_DATETIME=VALUES(UPDATE_DATETIME)
            '''
            valueList = list()
            for article in toBeUpdateArticleList:
                statistics = article.statistics
                data = article.publish_datetime
                data = data.strftime('%Y-%m-%d') if data is not None else None
                if isEventTable is False:
                    eventIdFieldValue = ''
                else:
                    eventIdFieldValue = str(article.eventId) + ','
                valueList.append(
                    '("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s","%s")' % (
                        article.tid,
                        eventIdFieldValue,
                        article.channel_id,
                        statistics.read_count if statistics.read_count is not None else Constants.DEFAULT_NUM,
                        statistics.like_count if statistics.like_count is not None else Constants.DEFAULT_NUM,
                        statistics.reply_count if statistics.reply_count is not None else Constants.DEFAULT_NUM,
                        statistics.forward_count if statistics.forward_count is not None else Constants.DEFAULT_NUM,
                        statistics.collect_count if statistics.collect_count is not None else Constants.DEFAULT_NUM,
                        statistics.heat if statistics.heat is not None else Constants.DEFAULT_NUM,
                        n,
                        data if data is not None else Constants.DATE,
                    ))
            if len(valueList) > 0:
                sql = insertSql % (tableName, eventIdFieldName, ','.join(valueList))
                self.dbProxy.execute(sql)
                self.dbProxy.commit()
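# A hedged sketch of the same ON DUPLICATE KEY UPDATE upsert written with a
# parameterized executemany (assuming a DB-API cursor such as pymysql's, which uses
# %s placeholders). The table name below is an example, not the project's real table;
# the column list mirrors the statement above. Parameter binding avoids the quoting
# and injection issues of building VALUES tuples with string formatting.
UPSERT_SQL = '''
    INSERT INTO sa_article_example
        (TID, CHANNEL_ID, READ_COUNT, LIKE_COUNT, REPLY_COUNT, FORWARD_COUNT,
         COLLECT_COUNT, HEAT, UPDATE_DATETIME, PUBLISH_DATE)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        READ_COUNT=VALUES(READ_COUNT), LIKE_COUNT=VALUES(LIKE_COUNT),
        REPLY_COUNT=VALUES(REPLY_COUNT), FORWARD_COUNT=VALUES(FORWARD_COUNT),
        COLLECT_COUNT=VALUES(COLLECT_COUNT), HEAT=VALUES(HEAT),
        UPDATE_DATETIME=VALUES(UPDATE_DATETIME)
'''

def upsert_statistics(cursor, rows):
    """rows: iterable of 10-tuples in the column order declared in UPSERT_SQL."""
    cursor.executemany(UPSERT_SQL, rows)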