import content_per_article
from database import Mysql

connDB1 = Mysql.connDB()        # (connection, cursor)
Mysql.exeSearch(connDB1[1])
ids = connDB1[1].fetchall()     # ids of articles already stored

n = 0                           # number of rows inserted this run
for p in range(9):              # crawl pages 0..8; the real page number is p + 1
    a = content_per_article.get_url_title(p)
    if not a[0]:                # every article on this page is already in the database
        continue
    for s in range(len(a[0])):  # one iteration per article scraped from this page
        # defensive re-check: skip rows already stored, or scraped without an author
        if (a[0][s],) in ids or a[7][s] == '':
            continue
        sql = ("INSERT INTO xungen(id,title,subtitle,summary,content,picurl,"
               "rid,author,create_time,public_time,update_time,isanonymous,"
               "content_imgs,city) VALUES")
        sql1 = (sql + '("' + str(a[0][s]) + '","' + str(a[1][s]) + '","'
                + str(a[2][s]) + '","' + str(a[3][s]) + '","' + str(a[4][s])
                + '","' + str(a[5][s]) + '","' + str(a[6]) + '","'
                + str(a[7][s]) + '","' + str(a[8][s]) + '","' + str(a[9][s])
                + '","' + str(a[10][s]) + '","' + str(a[11]) + '","'
                + str(a[12][s]) + '","' + str(a[13]) + '")')
        Mysql.exeUpdate(connDB1[0], connDB1[1], sql1)
        n += 1
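# Sketch of a safer alternative: a parameterized INSERT that lets the pymysql
# cursor quote every value, instead of concatenating pre-escaped strings.
# This assumes connDB1 is a (connection, cursor) pair as used above; it is a
# hypothetical helper, not part of the project's Mysql wrapper. Note that with
# this approach the escape_string() call in content_per_article would be
# unnecessary (it would double-escape backslashes).
def insert_article_row(conn, cursor, a, s):
    insert_sql = (
        "INSERT INTO xungen(id,title,subtitle,summary,content,picurl,rid,"
        "author,create_time,public_time,update_time,isanonymous,"
        "content_imgs,city) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    row = (a[0][s], a[1][s], a[2][s], a[3][s], a[4][s], a[5][s], a[6],
           a[7][s], a[8][s], a[9][s], a[10][s], a[11], a[12][s], a[13])
    cursor.execute(insert_sql, row)  # the driver escapes and quotes each value
    conn.commit()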
import time

import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string

import content_per_page
from database import Mysql


def get_url_title(p):
    """Scrape every new article on page p; return the column values as parallel lists."""
    id = []
    title = []
    summary = []
    content = []
    picurl = []
    author = []
    create_time = []
    update_time = []
    content_imgs = []
    rid = '0'
    isanonymous = 'No'
    city = 'Macheng'

    Page_con = content_per_page.get_page(p)
    connDB1 = Mysql.connDB()
    Mysql.exeSearch(connDB1[1])
    aids = connDB1[1].fetchall()

    for item in Page_con:
        res = requests.get(item['link'])
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'lxml')
        # skip articles already in the database, or whose page no longer
        # exists (a deleted article has no #post-user node)
        if (item['aid'],) in aids or soup.select('#post-user') == []:
            time.sleep(1)
            continue
        id.append(item['aid'])
        title.append(item['title'].replace('"', '”'))
        summary.append(item['digest'].replace('"', '”'))
        # escape the article body so it can be embedded in the INSERT string
        content.append(escape_string('\n'.join(
            str(tag) for tag in soup.select('p')[2:-6])))
        picurl.append(item['cover'])
        author.append(soup.select('#post-user')[0].text)
        # older pages carry the date in #post-date, newer ones in #publish_time
        date_tag = soup.select('#post-date') or soup.select('#publish_time')
        create_time.append(date_tag[0].text)
        update_time.append(item['update_time'])
        # collect the lazy-loaded image URLs (WeChat stores them in data-src)
        content_img = [img['data-src'] for img in soup.select('img')
                       if img.has_attr('data-src')]
        content_imgs.append(';'.join(content_img))
        time.sleep(5)  # throttle requests to avoid being blocked

    subtitle = title            # subtitle mirrors title
    public_time = create_time   # public_time mirrors create_time
    return (id, title, subtitle, summary, content, picurl, rid, author,
            create_time, public_time, update_time, isanonymous,
            content_imgs, city)
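# Quick smoke test when running this module directly; page index 0 is just an
# example, any valid page index works.
if __name__ == '__main__':
    a = get_url_title(0)
    print('scraped %d new article(s)' % len(a[0]))
    for aid, article_title in zip(a[0], a[1]):
        print(aid, article_title)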