Example #1
    def InsData(self, url, newskind, DateDif):
        try:
            rep = requests.get(url, timeout=5)
            res = rep.content
        except requests.exceptions.RequestException:
            # Treat any request failure (timeout, connection error, ...) as an empty response.
            print 'request failed: %s' % url
            res = ''
        if res != '':
            soup = bs(res, 'html.parser')
            soupdiv = soup.find(
                attrs={'class': ['ep-content-main', 'd_info clearfix']})

            titlediv = soupdiv.find(attrs={'id': 'h1title'})
            title = titlediv.text.strip()

            # Publication time: fall back to the inline-styled div when the standard one is missing.
            timediv = soupdiv.find(attrs={'class': 'ep-time-soure cDGray'})
            if timediv is None:
                timediv = soupdiv.find(attrs={'style': 'float:left;'})
            # The first 19 characters hold 'YYYY-MM-DD HH:MM:SS'.
            newstime = timediv.text.strip()[0:19]
            newstime = datetime.datetime.strptime(newstime,
                                                  '%Y-%m-%d %H:%M:%S')

            # Article source; default to '网易彩票' when the source element is missing.
            sourcediv = soupdiv.find(attrs={'id': 'ne_article_source'})
            if sourcediv is None:
                source = '网易彩票'
            else:
                source = sourcediv.text.strip()

            # Plain-text body: concatenate the text of every <p> in the article.
            newstxt = ''.join(p.text for p in soupdiv('p'))
            newscontent1 = '待解决'  # placeholder for the 'content' column ('待解决' = pending)
            # Target publication date: today minus DateDif days (DateDif == -1 disables the filter).
            now = datetime.datetime.now() - datetime.timedelta(days=DateDif)
            currdate = newstime
            picname = ''
            picpath = ''
            txtpic = ''
            intro = ''

            # Download images, but only for articles published on the target date
            # (or for every article when DateDif == -1).
            if (now.year == currdate.year and now.month == currdate.month
                    and now.day == currdate.day) or DateDif == -1:
                for c in soupdiv('p'):
                    txtpic = txtpic + c.text + '\n'
                    if c('img'):
                        for i in c('img'):
                            imgdir = 'D:/image/163/' + datetime.datetime.strftime(
                                newstime, '%Y%m%d') + '/'
                            # Keep only sources with a recognised image extension.
                            src = re.findall(r"(.*\.png|.*\.jpg|.*\.gif|.*\.jpeg)",
                                             i['src'])
                            if not src:
                                continue
                            imgname = os.path.basename(src[0])
                            dirall = imgdir + imgname
                            urlpath = '/' + datetime.datetime.strftime(
                                newstime, '%Y%m%d') + '/' + imgname
                            if not os.path.exists(imgdir):
                                os.makedirs(imgdir)
                            # Download the image; self.Schedule is the urlretrieve progress hook.
                            urllib.urlretrieve(src[0], dirall, self.Schedule)
                            picname = picname + imgname + '|'
                            # Record an [image=...] marker after the accumulated text.
                            txtpic = '\n' + '%s[image=%s]' % (txtpic, urlpath)
                            picpath = imgdir

                txtpic = txtpic.strip()
                # Value for the joincount column, supplied by Func.GetComment.
                cmt = Func.GetComment('163', newskind, url)
                conn = MySQLdb.connect(host='.',
                                       port=3306,
                                       user='******',
                                       passwd='123456',
                                       db='news_info',
                                       charset='utf8')
                cur = conn.cursor()
                # Insert the row, skipping it when an article with the same title already
                # exists; values are interpolated straight into the SQL string (see the
                # parameterized sketch after this example).

                dictypename = {1: '彩市新闻', 2: '数字大奖'}
                sql = 'insert into tbl_news_info (title,content,pubtime,newstype,source,typename,webname,newsurl,contenttxt,intro,picpath,picname,txtpic,joincount) select \'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',%d from DUAL where not exists (select 1 from tbl_news_info where title = \'%s\' limit 1)' % (
                    title, newscontent1, newstime, '其它', source,
                    dictypename[newskind], '网易', url, newstxt, intro, picpath,
                    picname, txtpic, cmt, title)
                try:
                    cur.execute(sql)
                    conn.commit()
                except Exception as e:
                    # Log the failed insert with the full traceback.
                    print 'insert failed for url: %s' % url
                    print 'error: %s' % repr(e)
                    traceback.print_exc()
                    print '########################################################'
                finally:
                    cur.close()
                    conn.close()
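
The insert above splices scraped values into the SQL text with %-formatting, so a single quote in a title or article body breaks the statement. Below is a minimal sketch of the same duplicate-guarded insert using MySQLdb's parameter placeholders instead; insert_news and its shortened column list are illustrative only, while the table name, the dedup-by-title condition, and the connection object follow the example above.

def insert_news(conn, row):
    # conn: a MySQLdb connection, e.g. MySQLdb.connect(..., charset='utf8') as above.
    # row:  dict with the scraped fields.
    # Same INSERT ... SELECT ... FROM DUAL WHERE NOT EXISTS pattern, but the driver
    # escapes every value, so quotes in the scraped text cannot break the statement.
    sql = ('insert into tbl_news_info '
           '(title, content, pubtime, source, newsurl, contenttxt) '
           'select %s, %s, %s, %s, %s, %s from DUAL '
           'where not exists '
           '(select 1 from tbl_news_info where title = %s limit 1)')
    cur = conn.cursor()
    try:
        cur.execute(sql, (row['title'], row['content'], row['pubtime'],
                          row['source'], row['newsurl'], row['contenttxt'],
                          row['title']))
        conn.commit()
    finally:
        cur.close()
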
Example #2
    def InsData(self, url, newskind, DateDif):
        try:
            rep = requests.get(url, timeout=5)
            res = rep.content
        except requests.exceptions.RequestException:
            # Treat any request failure as an empty response.
            print 'request failed: %s' % url
            res = ''

        if res != '':
            soup = bs(res, 'html.parser')

            soupcontent = soup.find(attrs={'class': 'articleContent'})

            souptitle = soup.find(attrs={'class': 'articleTitle'})
            title = souptitle.text

            # Meta line: author, publish time and source, separated by double spaces.
            soupinfo = soup.find(attrs={'class': 'aInfo'})
            infolist = soupinfo.text.split('  ')
            while '' in infolist:
                infolist.remove('')

            author = infolist[0]

            # Strip the '发表于:' ("published at") prefix.
            newstime = infolist[1].replace('发表于:', '')

            # Local image directory: D:/image/win310/YYYY/MM/DD/
            localpath = 'D:/image/win310/%s/%s/%s/' % (
                newstime[0:4], newstime[5:7], newstime[8:10])

            # Strip the '来源:' ("source") prefix.
            source = infolist[2].replace('来源:', '')

            soupintro = soup.find(attrs={'class': 'aBrief'})
            newsintro = soupintro.text.strip()

            content = soupcontent

            newscon = ''  # placeholder for the 'content' column

            # Plain-text body with line breaks removed.
            newstxt = content.text.strip().replace('\n', '')

            now = datetime.datetime.now() - datetime.timedelta(days=DateDif)
            currdate = datetime.datetime.strptime(newstime, '%Y-%m-%d %H:%M')
            picname = ''
            picpath = ''
            txtpic = ''

            # Same date filter as in Example #1: process only articles published on the
            # target date, or all of them when DateDif == -1.
            if (now.year == currdate.year and now.month == currdate.month
                    and now.day == currdate.day) or DateDif == -1:
                for con in content('p'):
                    txtpic = txtpic + con.text.strip() + '\n'
                    if con('img'):
                        img = con('img')
                        for i in img:
                            picname = os.path.basename(i['src'])
                            localpathall = localpath + picname

                            urlpath = localpathall.replace(
                                'D:/image/win310', '')
                            downurl = i['src']
                            # Normalise the src to an absolute URL on www.310win.com.
                            downurl = 'http://www.310win.com' + downurl.replace(
                                'http://www.310win.com', '')

                            if not os.path.exists(localpath):
                                os.makedirs(localpath)
                            urllib.urlretrieve(downurl, localpathall,
                                               self.Schedule)
                        txtpic = '%s[image=%s]' % (txtpic.strip(),
                                                   urlpath) + '\n'

                cmt = Func.GetComment('310win', newskind, url)
                picpath = localpath
                conn = MySQLdb.connect(host='.',
                                       port=3306,
                                       user='******',
                                       passwd='123456',
                                       db='news_info',
                                       charset='utf8')
                cur = conn.cursor()
                # Insert the row, skipping it when an article with the same title already
                # exists (values are interpolated into the SQL string, as in Example #1).

                dictypename = {
                    1: '双色球',
                    2: '大乐透',
                    3: '七星彩',
                    4: '福彩3d',
                    5: '排列3排列5'
                }
                sql = 'insert into tbl_news_info (title,content,pubtime,newstype,source,typename,webname,newsurl,contenttxt,intro,picpath,picname,txtpic,joincount,author) select \'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',\'%s\',%d,\'%s\' from DUAL where not exists(select 1 from tbl_news_info where title = \'%s\' limit 1)' % (
                    title, newscon, newstime, '', source,
                    dictypename[newskind], '彩客网', url, newstxt, newsintro,
                    picpath, picname, txtpic, cmt, author, title)
                try:
                    cur.execute(sql)
                    conn.commit()
                except Exception as e:
                    # Log the failed insert with the full traceback.
                    print 'insert failed for url: %s' % url
                    print 'error: %s' % repr(e)
                    traceback.print_exc()
                    print '########################################################'
                finally:
                    cur.close()
                    conn.close()
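
Both examples hand self.Schedule to urllib.urlretrieve as its third argument, but the method itself is not part of the listing. Below is a minimal sketch that assumes Schedule is the standard urlretrieve progress hook; the class name NewsSpider and the console output format are illustrative only.

import sys

class NewsSpider(object):
    def Schedule(self, blocknum, blocksize, totalsize):
        # urllib.urlretrieve reporthook signature:
        #   blocknum  - number of blocks transferred so far
        #   blocksize - size of one block in bytes
        #   totalsize - total file size in bytes (-1 when unknown)
        if totalsize <= 0:
            return
        percent = min(100.0, 100.0 * blocknum * blocksize / totalsize)
        sys.stdout.write('\rdownloading: %5.1f%%' % percent)
        sys.stdout.flush()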