def InsData(self, url, newskind, DateDif):
    """Scrape one 163.com news page and insert it into tbl_news_info.

    The page is fetched, its title, publish time, source and paragraph text
    are extracted, images are optionally downloaded below D:/image/163/, and
    one row is inserted unless a row with the same title already exists.

    Args:
        url: Address of the article page.
        newskind: Key into the type-name table (1 or 2); it is also passed
            to Func.GetComment.
        DateDif: Day offset. Images are downloaded only when the article was
            published on (now - DateDif days), or always when DateDif == -1.

    Returns:
        None. Fetch failures, unrecognised page layouts and database errors
        are reported on stdout instead of being raised.
    """
    try:
        res = requests.get(url, timeout=5).content
    except Exception:
        # Best effort: a page that cannot be fetched is skipped.
        print('time out')
        res = ''
    if not res:
        return

    soup = bs(res, 'html.parser')
    soupdiv = soup.find(
        attrs={'class': ['ep-content-main', 'd_info clearfix']})
    titlediv = None
    timediv = None
    if soupdiv is not None:
        titlediv = soupdiv.find(attrs={'id': 'h1title'})
        timediv = soupdiv.find(attrs={'class': 'ep-time-soure cDGray'})
        if timediv is None:
            # Alternative page layout keeps the timestamp in a floated node.
            timediv = soupdiv.find(attrs={'style': 'float:left;'})
    if soupdiv is None or titlediv is None or timediv is None:
        # Unknown layout: skip instead of failing with AttributeError.
        print('unrecognised page layout, url:%s' % url)
        return

    title = titlediv.text.strip()
    # Only the leading 'YYYY-MM-DD HH:MM:SS' part of the node text is the time.
    newstime = datetime.datetime.strptime(timediv.text.strip()[0:19],
                                          '%Y-%m-%d %H:%M:%S')

    sourcediv = soupdiv.find(attrs={'id': 'ne_article_source'})
    if sourcediv is None:
        source = '网易彩票'
    else:
        source = sourcediv.text.strip()

    paragraphs = soupdiv('p')
    newstxt = ''.join(p.text for p in paragraphs)
    newscontent1 = '待解决'

    picname = ''
    picpath = ''
    txtpic = ''
    intro = ''

    # Process images
    target = datetime.datetime.now() - datetime.timedelta(days=DateDif)
    if target.date() == newstime.date() or DateDif == -1:
        daydir = datetime.datetime.strftime(newstime, '%Y%m%d')
        imgdir = 'D:/image/163/' + daydir + '/'
        # 'jpeg' must precede 'jpg', otherwise '.jpeg' URLs are cut to '.jpg'.
        # The greedy prefix also drops anything after the extension.
        imgpattern = re.compile(r'.*\.(?:png|jpeg|jpg|gif)')
        for para in paragraphs:
            txtpic = txtpic + para.text + '\n'
            for img in para('img'):
                found = imgpattern.search(img.get('src') or '')
                if found is None:
                    # Not a recognised image URL; nothing to download.
                    continue
                imgurl = found.group(0)
                imgname = os.path.basename(imgurl)
                urlpath = '/' + daydir + '/' + imgname
                if not os.path.exists(imgdir):
                    os.makedirs(imgdir)
                urllib.urlretrieve(imgurl, imgdir + imgname, self.Schedule)
                picname = picname + imgname + '|'
                txtpic = '\n' + '%s[image=%s]' % (txtpic, urlpath)
                picpath = imgdir
        txtpic = txtpic.strip()

    cmt = Func.GetComment('163', newskind, url)

    # Insert data
    dictypename = {1: '彩市新闻', 2: '数字大奖'}
    # Looked up before connecting so an unknown kind cannot leak a connection.
    typename = dictypename[newskind]
    # Placeholders let the driver escape scraped text; quotes in a title or
    # body can no longer break (or inject into) the statement.
    sql = ('insert into tbl_news_info '
           '(title,content,pubtime,newstype,source,typename,webname,newsurl,'
           'contenttxt,intro,picpath,picname,txtpic,joincount) '
           'select %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s from DUAL '
           'where not exists '
           '(select 1 from tbl_news_info where title = %s limit 1)')
    params = (title, newscontent1, newstime, '其它', source, typename,
              '网易', url, newstxt, intro, picpath, picname, txtpic, cmt,
              title)

    # NOTE(review): credentials are hard-coded; consider moving to config.
    conn = MySQLdb.connect(host='.',
                           port=3306,
                           user='******',
                           passwd='123456',
                           db='news_info',
                           charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
        finally:
            cur.close()
        conn.commit()
    except Exception as e:
        print('str(Exception):\t %s' % str(Exception))
        print('str(e):\t\t %s' % str(e))
        print('repr(e):\t %s' % repr(e))
        print('e.message:\t %s' % getattr(e, 'message', ''))
        print('traceback.print_exc():')
        traceback.print_exc()
        print('traceback.format_exc():\n%s' % traceback.format_exc())
        print('url:%s' % url)
        print('########################################################')
    finally:
        # Close the connection on failure too, not only on success.
        conn.close()
def InsData(self, url, newskind, DateDif):
    """Scrape one 310win.com article and insert it into tbl_news_info.

    The page is fetched once, its title, author, publish time, source, intro
    and body text are extracted, images are optionally downloaded below
    D:/image/win310/<yyyy>/<mm>/<dd>/, and one row is inserted unless a row
    with the same title already exists.

    Args:
        url: Address of the article page.
        newskind: Key into the type-name table (1 to 5); it is also passed
            to Func.GetComment.
        DateDif: Day offset. Images are downloaded only when the article was
            published on (now - DateDif days), or always when DateDif == -1.

    Returns:
        None. Fetch failures, unrecognised page layouts and database errors
        are reported on stdout instead of being raised.
    """
    try:
        # One request is enough; its response is reused for parsing.
        rep = requests.get(url, timeout=5)
    except Exception:
        # Best effort: a page that cannot be fetched is skipped.
        print('time out')
        return

    soup = bs(rep.content, 'html.parser')
    content = soup.find(attrs={'class': 'articleContent'})
    souptitle = soup.find(attrs={'class': 'articleTitle'})
    soupinfo = soup.find(attrs={'class': 'aInfo'})
    soupintro = soup.find(attrs={'class': 'aBrief'})
    if None in (content, souptitle, soupinfo, soupintro):
        # Unknown layout: skip instead of failing with AttributeError.
        print('unrecognised page layout, url:%s' % url)
        return

    title = souptitle.text
    # The info line is split on single spaces; empty pieces are dropped.
    infolist = [part for part in soupinfo.text.split(' ') if part != '']
    if len(infolist) < 3:
        print('unrecognised page layout, url:%s' % url)
        return
    author = infolist[0]
    newstime = infolist[1].replace('发表于:', '')
    source = infolist[2].replace('来源:', '')
    # Directory pieces are sliced from the 'YYYY-MM-DD ...' timestamp text.
    localpath = 'D:/image/win310/%s/%s/%s/' % (
        newstime[0:4], newstime[5:7], newstime[8:10])
    newsintro = soupintro.text.strip()
    newscon = ''
    newstxt = content.text.strip().replace('\n', '')

    currdate = datetime.datetime.strptime(newstime, '%Y-%m-%d %H:%M')
    target = datetime.datetime.now() - datetime.timedelta(days=DateDif)

    picname = ''
    txtpic = ''
    if target.date() == currdate.date() or DateDif == -1:
        for para in content('p'):
            txtpic = txtpic + para.text.strip() + '\n'
            for img in para('img'):
                # NOTE(review): picname is overwritten per image, so only the
                # last file name reaches the database - confirm this is wanted.
                picname = os.path.basename(img['src'])
                localpathall = localpath + picname
                urlpath = localpathall.replace('D:/image/win310', '')
                # Normalise relative and absolute src values to absolute.
                downurl = 'http://www.310win.com' + img['src'].replace(
                    'http://www.310win.com', '')
                if not os.path.exists(localpath):
                    os.makedirs(localpath)
                urllib.urlretrieve(downurl, localpathall, self.Schedule)
                txtpic = '%s[image=%s]' % (txtpic.strip(), urlpath) + '\n'

    cmt = Func.GetComment('310win', newskind, url)
    picpath = localpath

    # Insert data
    dictypename = {
        1: '双色球',
        2: '大乐透',
        3: '七星彩',
        4: '福彩3d',
        5: '排列3排列5',
    }
    # Looked up before connecting so an unknown kind cannot leak a connection.
    typename = dictypename[newskind]
    # Placeholders let the driver escape scraped text; quotes in a title or
    # body can no longer break (or inject into) the statement.
    sql = ('insert into tbl_news_info '
           '(title,content,pubtime,newstype,source,typename,webname,newsurl,'
           'contenttxt,intro,picpath,picname,txtpic,joincount,author) '
           'select %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s from DUAL '
           'where not exists'
           '(select 1 from tbl_news_info where title = %s limit 1)')
    params = (title, newscon, newstime, '', source, typename, '彩客网', url,
              newstxt, newsintro, picpath, picname, txtpic, cmt, author,
              title)

    # NOTE(review): credentials are hard-coded; consider moving to config.
    conn = MySQLdb.connect(host='.',
                           port=3306,
                           user='******',
                           passwd='123456',
                           db='news_info',
                           charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
        finally:
            cur.close()
        conn.commit()
    except Exception as e:
        print('str(Exception):\t %s' % str(Exception))
        print('str(e):\t\t %s' % str(e))
        print('repr(e):\t %s' % repr(e))
        print('e.message:\t %s' % getattr(e, 'message', ''))
        print('traceback.print_exc():')
        traceback.print_exc()
        print('traceback.format_exc():\n%s' % traceback.format_exc())
        print('url:%s' % url)
        print('########################################################')
    finally:
        # Close the connection on failure too, not only on success.
        conn.close()