Example #1
    def __startToMix__(self):  # each call generates one mixed article
        datahelper = DB()
        allHtml = datahelper.__randomP__()

        title,header,tail,id = datahelper.__randomHandT__()  # id falls back to 0 when no row is found

        # print(title,header,tail)
        #title = "<h1>"+title+"</h1>"
        # print(header)
        # print("the assembled html ends up here")
        mixP = title+header+allHtml+tail
        # print("total length of this article "+str(len(new_news)))
        # print(title+header+allHtml+tail)

        # f= open(r"D:\pyfile\ProxySpider\newMission\new.html","w")
        # f.write(new_news)  ## remember to use content for multimedia files!
        # f.close()
        # print("generated title:  "+title)
        # print("generated body: "+mixP)

        if(title!="" and mixP !=""):
            datahelper.insertMixP(title,mixP)  # generation succeeded
            datahelper.updateMixState(id)  # no extra update is needed here; this call performs it itself
            # print("generated successfully")
            return True

        else:
            print("Mix generation failed: no url with a non-empty title was found")
            # datahelper.deleteUrl()
            return False
Example #2
    def job(self, name):  # this runs in the main process
        dbhelper = DB()  # todo each worker's exceptions need separate handling; the parent process may have to manage the children's errors later
        print("Crawling today's news")
        print('Current process: %s   parent PID: %s' % (os.getpid(), os.getppid()))
        p1 = multiprocessing.Process(target=worker_1, args=(6, ))
        p2 = multiprocessing.Process(target=worker_2, args=(3, ))  # Tencent
        p3 = multiprocessing.Process(target=worker_3, args=(4, ))

        p1.daemon = True
        p2.daemon = True
        p3.daemon = True
        p1.start()
        p2.start()
        p3.start()
        print("The number of CPU is:" + str(multiprocessing.cpu_count()))
        for p in multiprocessing.active_children():
            print("child   p.name:" + p.name + "\tp.id" + str(p.pid))

        p1.join()
        p2.join()
        p3.join()  # join() blocks until each child finishes

        # print("The number of CPU is:" + str(multiprocessing.cpu_count()))  # once join() returns the children have exited, so their pids can no longer be read
        # for p in multiprocessing.active_children():
        #     print("child   p.name:" + p.name + "\tp.pid:" + str(p.pid))
        print(
            "today work done ND!!!!!!!!!!!!!!!!!")  # main process; the join() calls above make it wait for the children before printing this
        print("all over !")
        print("Deduplicating...")
        dbhelper.quchong()  # run the dedup routine
        print("Waiting for tomorrow...")
        dbhelper.getAllTitle()
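
The todo in job() asks how worker failures can be handled centrally. One minimal sketch: have each child ship its traceback back through a multiprocessing.Queue and let the parent inspect it after join(). worker and supervised below are hypothetical stand-ins, not part of the original code:

import multiprocessing
import traceback

def worker(interval):
    # stand-in for worker_1 / worker_2 / worker_3
    if interval == 3:
        raise ValueError("simulated crawler failure")

def supervised(target, errors, *args):
    # run one worker and report any traceback back to the parent
    try:
        target(*args)
    except Exception:
        errors.put(traceback.format_exc())

if __name__ == "__main__":
    errors = multiprocessing.Queue()
    procs = [multiprocessing.Process(target=supervised, args=(worker, errors, n))
             for n in (6, 3, 4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()  # wait for every child, as job() does
    while not errors.empty():
        print(errors.get())  # the parent decides how to react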
Example #3
def wangyiFill(wangyiUrls):
    datehelper = DB()
    flag = 1
    for url in wangyiUrls:
        flag += 1
        if flag % 50 == 0:  # rest after every 50 pages
            time.sleep(60 * 3)
            datehelper.refreshConnection()
        title, Hcontent, Tcontent, Acontent = fillWangyi.getPageContent(
            url[0])  # title == '' usually means the page format is unsupported
        if (title != "" and Hcontent != ""):
            datehelper.updateContent(url[0], title, Hcontent, Tcontent,
                                     Acontent)  # updating requires a non-empty title
            mixNews = MixNews()
            state = mixNews.__startToMix__()
            if (state):  # __startToMix__ already writes the result to the database
                datehelper.updateState(url[0])
Example #4
def fenghuangFill(tengxunUrls):
    datehelper = DB()
    flag = 1
    for url in tengxunUrls:
        flag += 1
        if flag % 50 == 0:  # rest after every 50 opened pages
            time.sleep(60 * 3)  # start a fresh database connection after each sleep
            datehelper.refreshConnection()

        title, Hcontent, Tcontent, Acontent = fillFenghaung.getPageContent(
            url[0])
        if (title != "" and Hcontent != ""):
            datehelper.updateContent(url[0], title, Hcontent, Tcontent,
                                     Acontent)
            mixNews = MixNews()
            state = mixNews.__startToMix__()
            if (state):  # __startToMix__ already writes the result to the database
                datehelper.updateState(url[0])
Example #5
def tengxunFill(tengxunUrls):
    datehelper = DB()
    flag = 1
    for url in tengxunUrls:
        print(url[0])
        print(url)
        flag += 1
        if flag % 50 == 0:  # rest after every 50 pages
            print("Going to sleep")
            time.sleep(60 * 2)
            # datehelper.refreshConnection()
        title, Hcontent, Tcontent, Acontent = fillTengxun.getPageContent(
            url[0])  # fill in the content, then update the row
        if (title != "" and Hcontent != ""):
            print(url[0])
            print(url)
            datehelper.updateContent(url[0], title, Hcontent, Tcontent,
                                     Acontent)
            mixNews = MixNews()
            state = mixNews.__startToMix__()  # returns whether the mix was generated
            if (state):  # __startToMix__ already writes the result to the database
                print("Mixed article generated successfully!")
                datehelper.updateState(url[0])
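
wangyiFill, fenghuangFill and tengxunFill above differ only in the parser module they call, so one parameterized function could replace all three. A sketch under that assumption; DB, MixNews and the fill* parsers are the snippet's own names, sourceFill itself is hypothetical:

import time

def sourceFill(urls, parser, rest_every=50, rest_seconds=180):
    datehelper = DB()
    for count, url in enumerate(urls, start=1):
        if count % rest_every == 0:
            time.sleep(rest_seconds)        # pause so the site is not hammered
            datehelper.refreshConnection()  # idle MySQL connections may be dropped
        title, Hcontent, Tcontent, Acontent = parser.getPageContent(url[0])
        if title != "" and Hcontent != "":
            datehelper.updateContent(url[0], title, Hcontent, Tcontent, Acontent)
            if MixNews().__startToMix__():  # writes the mixed article itself
                datehelper.updateState(url[0])

# e.g. sourceFill(wangyiUrls, fillWangyi) stands in for wangyiFill(wangyiUrls)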
Example #6
class wangyiPageContent:
    def __init__(self):
        self.dbhelper = DB()

    def stripImgUrl(self, replacedSrc):
        # Turn an image URL into a safe file name: drop the query string,
        # then replace or strip the remaining punctuation (str.replace is a
        # no-op when the substring is absent, so the find() guards and the
        # duplicated colon checks of the original were unnecessary).
        if replacedSrc.find("?") != -1:
            replacedSrc = replacedSrc.split("?")[0]
        for old, new in (("：", "_"), (".", "_"), ("/", "_"), ("-", "_"),
                         ("!", "_"), ("\"", "_"), (" ", ""), ("“", ""),
                         ("”", ""), (":", ""), ("|", "_")):
            replacedSrc = replacedSrc.replace(old, new)

        return replacedSrc

    def fixCssdot(self, pContent):
        # print(pContent)
        if (pContent.find("'")) != -1:  # single quotes here usually come from inline styles
            replaceString = pContent.replace("'", '"')
            # print(replaceString)
            return replaceString
        elif pContent.find("\n") != -1:  # find() returns -1 when absent, so compare explicitly
            return pContent.replace("\n", "")

        else:
            return pContent

    # def getPageContent(self,url): # given a url, return the localized article from that page
    #     t = time.time()
    #     timeStamp =str(int(round(t * 1000)))  # millisecond timestamp
    #     time.sleep(1)
    #     downhelp = Download(r'/home/default/images') # set the download path
    #
    #     dbhelper = DB()
    #
    #     title,Hcontent,Tcontent,Acontent="","","",""  # the values to return
    #     simpleP = []
    #     soup =makeBS().makesoup(url)
    #     # print(soup.prettify())
    #     title = soup.find("h1",attrs={"class":True})
    #     print("title ",title)
    #     if (title!=None):
    #         title  = title.text
    #         if(title.find(":")!=-1):
    #             title = title.replace(":","")
    #
    #     main_content =  soup.find("div",attrs={"class":"post_text"})
    #     if (main_content != None):
    #         allPP = main_content.find_all("p")
    #         for p in range(0, len(allPP)):
    #             localImgList = allPP[p].find_all("img", attrs={"src": True})  # extract the imgs inside each p tag and localize their links
    #             if (p == 0):
    #                 Hcontent = allPP[p]
    #             if (localImgList != None):  # iterate over whatever was found
    #                 for img in localImgList:
    #                     if img != None:
    #                         print(img)
    #                         print(img['src'])
    #                         if (img['src'].find("//") == 0):  # protocol-relative path, prepend https
    #                             imgSrc = "https:" + img['src']
    #                             # filename = os.path.basename(imgSrc)
    #                             print(imgSrc)
    #                             # imgName = timeStamp + self.stripImgUrl(imgSrc)
    #                             now = time.strftime('%Y%m%d', time.localtime(time.time()))
    #                             now_date = now + "/"  # the downloaded file name itself must not contain the slash
    #                             imgName = now_date + self.stripImgUrl(imgSrc)
    #
    #                             print("file name is " + imgName)
    #                             downhelp.downloadImg(imgSrc, imgName=imgName, referer=None,now_date=now)  # the download itself works fine
    #                             img['src'] = "/images/" + imgName + ".jpg"  # the img after rewriting
    #                             print(img['src'])
    #
    #                             simpleP.append(allPP[p])
    #                             Acontent += str(allPP[p])
    #
    #                             # Acontent += str(allcontent[i])
    #                         elif (img['src'].find("https:") == 0):  # already an https url
    #                             imgSrc = img['src']
    #                             # filename = os.path.basename(imgSrc)
    #                             # print(imgSrc)
    #
    #                             now = time.strftime('%Y%m%d', time.localtime(time.time()))
    #                             now_date = now + "/"  # the downloaded file name itself must not contain the slash
    #                             imgName = now_date + self.stripImgUrl(imgSrc)
    #
    #                             print("file name is " + imgName)
    #                             downhelp.downloadImg(imgSrc, imgName=imgName, referer=None,now_date=now)
    #                             img['src'] = "/images/" + imgName + ".jpg"
    #                             print(img['src'])
    #                             simpleP.append(allPP[p])
    #                             Acontent += str(allPP[p])
    #
    #                         else:  # otherwise it is plain http
    #                             imgSrc = img['src']
    #                             # filename = os.path.basename(imgSrc)
    #                             # print(imgSrc)
    #
    #                             now = time.strftime('%Y%m%d', time.localtime(time.time()))
    #                             now_date = now + "/"  # the downloaded file name itself must not contain the slash
    #                             imgName = now_date + self.stripImgUrl(imgSrc)
    #
    #                             print("file name is " + imgName)
    #                             downhelp.downloadImg(imgSrc, imgName=imgName, referer=None,now_date=now)
    #                             img['src'] = "/images/" + imgName + ".jpg"
    #                             print(img['src'])
    #                             simpleP.append(allPP[p])
    #                             Acontent += str(allPP[p])
    #
    #         for p in simpleP:  # very tightly coupled
    #             dbhelper.insertSimpleP(p) # same as elsewhere
    #         print(title, Hcontent, Tcontent, Acontent)
    #         return title, Hcontent, Tcontent, Acontent
    #
    #
    #     else:
    #         title ="网易没找到标题"
    #         return title, Hcontent, Tcontent, Acontent


#--------------------------------------------------- the new version starts here ----------------------------------------------

    def getNewsContent(self, url):  # also parses image pages; these helpers could all live in the class constructor
        title, Hcontent, Tcontent, Acontent = "", "", "", ""  # initialize the return values
        allP = []  # holds every paragraph
        simplePList = []  # the middle paragraphs
        downloadTool = Download(r'/home/default/images')  # set the download path
        cooker = makeBS()
        soup = cooker.makesoup(url)  # soup may already be None here
        if soup == None:
            print("Got an empty page for url ", url)
            return title, Hcontent, Tcontent, Acontent
        try:
            title = soup.find("head")  # this is really the <head> element; the title is pulled out of it below
            # print(title)
            # time.sleep(60)
        except Exception as e:
            print(e)
            traceback.print_exc()  # reminder that we need to bail out here

            # print(title)
        # print("the title is")
        if title != None:  # if it was found
            title = title.text.split("_")[0]
            print(title)
            # time.sleep(60)
            # title=self.fixCssdot(title)
            if self.dbhelper.ifExists(title):
                return title, Hcontent, Tcontent, Acontent  # already stored, so skip parsing and image downloads
            # print(title)
        else:
            print("Could not find a title, please check the url " + url)
            # print(soup)
            # print()
            return title, Hcontent, Tcontent, Acontent  # otherwise hand back the empty values

        if url.find(
                "photoview") != -1:  # =============================== slideshow / photo-gallery pages
            print("This is a photo-gallery page")
            # print(soup)
            for span in soup.find_all("img", attrs={"src": True}):
                print(span)
            return title, Hcontent, Tcontent, Acontent  # hand the empty fields back; we do not keep these galleries

        else:  # --=================================================== ordinary text-plus-image news
            pass  # the regular news-page parsing follows
            # print(soup)

            pList = soup.find_all("p")
            checkLen = len(pList)

            for p in pList:  # walk the regular paragraphs
                # print(p)
                if p.img != None:
                    # print("image paragraph")
                    # print(p)
                    try:
                        imgSrc = p.img['src']
                        imgName = imgSrc.replace("https://inews.gtimg.com/",
                                                 "").replace("/", "_")
                        now = time.strftime('%Y%m%d',
                                            time.localtime(time.time()))
                        now_date = now + "/"  # the downloaded file name itself must not contain the slash
                        imgName = now_date + self.stripImgUrl(
                            imgName)  # the date prefix is now part of the name

                        # print("file name is " + imgName)                      # download the image to the configured server path
                        downloadTool.downloadImg(imgSrc,
                                                 imgName=imgName,
                                                 referer=None,
                                                 now_date=now)

                        imgPScr = "/images/" + imgName + ".jpg"  # the image address in html form
                        HtmlImg = '<p><img src="' + imgPScr + '"/></p>'
                        # print(HtmlImg)                                          # the rewritten image markup
                        allP.append(HtmlImg)  # collect it with the other paragraphs

                    except Exception as e:
                        print(e)
                        print("Could not find the image address")
                        print(p)
                        print()

                else:  # a plain text paragraph
                    if p.text != "用微信扫码二维码" and p.text != "分享至好友和朋友圈" and p.text != "" and p.text != "\n":  # skip the WeChat share boilerplate, matched verbatim against the page text
                        # print("plain paragraph")
                        if p.a == None:  # paragraphs containing links are dropped
                            # print(p)
                            allP.append('<p>' + p.text +
                                        '</p>')  # keep only the text, no styling

        # finally decide which paragraphs are the head and the tail
        for p in allP:
            if (len(allP)) >= 2:
                Acontent = Acontent + str(p)
                if allP.index(p) == 0:  # index 0 means this is the opening paragraph
                    Hcontent = p
                elif allP.index(p) == len(allP) - 1:  # the last one becomes the tail
                    Tcontent = p
                else:  # everything between head and tail goes into the loose-paragraph list
                    simplePList.append(p)  # the assorted middle paragraphs

            else:  # the whole article is a single block of text
                Acontent = Acontent + str(p)
                try:
                    Tcontent = "<p>" + p.split(
                        "。")[-2] + "</p>"  # use the sentence before the final 。 as the tail
                except Exception as e:
                    Tcontent = "<p>" + p.split(
                        "。")[0] + "</p>"  # if it cannot be split (a single sentence), head and tail are the same
                Hcontent = "<p>" + p.split("。")[0] + "</p>"  # the first sentence becomes the head
                simplePList.append(p)
        # print("title")
        # print(title)
        # print("Hcontent")
        # print(Hcontent)
        # print("Tcontent")
        # print(Tcontent)
        # print("Acontent")
        # print(Acontent)
        return self.fixCssdot(title), self.fixCssdot(Hcontent), self.fixCssdot(
            Tcontent), self.fixCssdot(Acontent)  # hand the cleaned pieces back
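
One caveat in the head/tail loop above: allP.index(p) returns the index of the first occurrence, so a paragraph whose markup appears twice (a repeated image, say) is always classified by its first position. Iterating with enumerate avoids both the misclassification and the repeated linear scans; a sketch, with splitHeadTail as a hypothetical helper:

def splitHeadTail(allP):
    # classify paragraphs by position rather than by value
    Hcontent, Tcontent, middle = "", "", []
    for i, p in enumerate(allP):
        if i == 0:
            Hcontent = p
        elif i == len(allP) - 1:
            Tcontent = p
        else:
            middle.append(p)
    return Hcontent, Tcontent, middle

# splitHeadTail(['<p>a</p>', '<p>b</p>', '<p>a</p>'])
# -> ('<p>a</p>', '<p>a</p>', ['<p>b</p>'])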
Example #7
        # print("合成的html文件在这儿")
        mixP  = title+header+allHtml+tail
        # print("本文总长"+str(len(new_news)))
        # print(title+header+allHtml+tail)

        # f= open(r"D:\pyfile\ProxySpider\newMission\new.html","w")
        # f.write(new_news)  ##多媒体文件要是用conctent哦!
        # f.close()/
        # print("生成的标题是  "+title  )
        # print("生成的内容是 "+mixP)

        if(title!="" and mixP !=""):
            datahelper.insertMixP(title,mixP)  #生成成功的话就是这样
            datahelper.updateMixState(id)  #这儿无法更新的  ,这儿自带更新了
            # print("生成成功")
            return True

        else:
            print("生成混合失败,没有找到title不为空的url")
            # datahelper.deleteUrl()
            return False
        # print(len(datahelper.__query__("select * from tengxun where title!='';")))


if __name__ == "__main__":  #这个就是url的东西
    dbhelper = DB()
    # ddd = MixNews()
    # ddd.__startToMix__()
    title, header, tail, id = dbhelper.__randomHandT__()

    print(title)
Example #8
    def getEveryDayWangyi(self):
        dbhelper = DB()
        dateurl = DateUrl()
        oneContent = wangyiPageContent()
        print("共提取到新闻url的数量有")
        now_date = (date.today() + timedelta(days=-1)).strftime(
            "%Y-%m-%d")  # 昨天日期
        print(now_date)
        # print(dateurl.getOneDayNewUrl("2018-07-03"))
        #1.页面新闻url写入数据库

        dateurl.getRollUrlList(
            now_date)  # 1.这个就是当天的,和凤凰一样,老样子啊,获得了链接后直接可以写入数据库中去了

        todayNewUrl = dbhelper.__query__(
            "select url from tengxun where urlState='False' and fromWhere='wangyi';"
        )  # 只要数据库中取出需要读取的url
        # print(type(todayNewUrl))
        print(len(todayNewUrl))  #这个才是打开来的东西

        urlNumer = len(todayNewUrl)
        print("正在打开网易的新闻的东西")
        print(todayNewUrl)

        # print("正在打开网易的新闻的东西")
        # print(todayNewUrl)

        # for newUrl in todayNewUrl:  # 2.然后把内容段落写入数据库
        #     title, Hcontent, Tcontent, Acontent = oneContent.getPageContent(newUrl)
        #     if (title != "网易没找到标题" and title != None and Hcontent != ""):  # 有内容的时候就更新这条数据
        #         dbhelper.updateContent(newUrl, title, Hcontent, Tcontent, Acontent)
        #
        #         print("正在生成新混合新闻。。。")  # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的
        #         mixNews = MixNews()
        #         mixNews.__startToMix__()  # 调用一次就执行一次,可以修改返回的状态
        #
        #     else:
        #         print("更新失败,标题提取失败,为空")
        count = 1  #计数,每100个就休息1分钟
        for dic in todayNewUrl:
            url = dic['url']  #2.把写入数据库的这几个新闻url的内容提取出来
            if count % 200 == 0:  #突然的中断应该是因为这边连接不上那儿,所以应该问题不大,每两百条休息一分钟
                time.sleep(60)
            count += 1

            #这儿的url是未转换成xw。电脑原版页面的url,所以,存的是这种url
            title, Hcontent, Tcontent, Acontent = oneContent.getNewsContent(
                url)  #这儿漏了更新到url中去  ,自动转换成xw的然后再下载
            time.sleep(1)
            # print(title, Hcontent, Tcontent, Acontent)
            if (title != "腾讯没找到标题" and title != None
                    and Hcontent != ""):  #有内容的时候就更新这条数据
                # print("要更新的url是 "+url)
                resultState = dbhelper.updateContent(url, title, Hcontent,
                                                     Tcontent,
                                                     Acontent)  #要删除的是更新失败的那个
                if resultState == False:  #更新成功
                    print("更新失败,正在删除这个重复的url")
                    print(url)
                    # dbhelper.deleteUrl(url)  # 按url把这条记录删除掉咯,生成失败也不需要删除这个拉,
                    print()
                else:
                    # print("正在生成新混合新闻。。。")  # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的
                    mixNews = MixNews()
                    if mixNews.__startToMix__() != True:  # 调用一次就执行一次,可以修改返回的状态
                        print("生成失败,已经没有刚填满的未用过的文章了")
                        print(url)
                        dbhelper.deleteUrl(
                            url)  # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了),
                        # print()
                    else:
                        print(True)
            else:
                print("打开页面提取失败,可能是页面为404网易,正在删除这条url " + url)  #为空的话,那么就删除这条把
                dbhelper.deleteUrl(
                    url)  #按url把这条记录删除掉咯 todo don't delete it first
    def getPageContent(self,
                       url):  # given a url, return the localized article; splitting it up this way is probably more trouble than it is worth
        t = time.time()
        timeStamp = str(int(round(t * 1000)))  # millisecond timestamp
        time.sleep(1)

        downhelp = Download(r'/home/default/images')  # set the download path
        dbhelper = DB()

        title, Hcontent, Tcontent, Acontent = "", "", "", ""
        simpleP = []
        soup = None
        try:
            soup = makeBS().makesoup(url)

            # print(soup)

            title = soup.find("head").title
            # print(chardet.detect(title.text))   测不准的这个东西
            # print(title)

            if title != None:
                title = title.text
                print("标题是" + str(title))
            main_content = soup.find("div", attrs={"id": "main_content"})
            flag = 1
            if (main_content != None):
                allPP = main_content.find_all("p")
                for p in range(0, len(allPP)):
                    # print()
                    # print(allPP[p])
                    # print(p)
                    if allPP[p].find_all("a") != None:
                        # print("找到了有链接的东西")
                        # print(allPP[p])
                        allPP[p].a.extract()
                        # print(allPP[p])

                    localImgList = allPP[p].find_all(
                        "img", attrs={"src": True})  # 每个p标签内的img提取和修改链接本地化

                    if (localImgList != None):  # iterate over whatever was found, localize it and rewrite the src
                        for img in localImgList:
                            if img != None:
                                # print("found an image")
                                # print(img)
                                # print(img['src'])
                                if (img['src'].find("//") == 0
                                    ):  # protocol-relative path, prepend https
                                    imgSrc = "https:" + img['src']
                                    # print(imgSrc)

                                    now = time.strftime(
                                        '%Y%m%d', time.localtime(time.time()))
                                    now_date = now + "/"  # the downloaded file name itself must not contain the slash
                                    imgName = now_date + self.stripImgUrl(
                                        imgSrc)

                                    # print("file name is " + imgName)

                                    # print(imgName)
                                    downhelp.downloadImg(
                                        imgSrc,
                                        imgName=imgName,
                                        referer=None,
                                        now_date=now)  # the download itself works fine
                                    img['src'] = "/images/" + imgName + ".jpg"  # the img after rewriting
                                    # print(img['src'])

                                    simpleP.append(allPP[p])
                                    Acontent += str(allPP[p])

                                    # Acontent += str(allcontent[i])
                                elif (img['src'].find("https:") == 0
                                      ):  # 本来就有找到有https协议
                                    imgSrc = img['src']
                                    # print(imgSrc)

                                    now = time.strftime(
                                        '%Y%m%d', time.localtime(time.time()))
                                    now_date = now + "/"  # 后面下载的文件名是不需要带杠的,后面就不需要带杠杠
                                    imgName = now_date + self.stripImgUrl(
                                        imgSrc)

                                    downhelp.downloadImg(imgSrc,
                                                         imgName=imgName,
                                                         referer=None,
                                                         now_date=now)
                                    img['src'] = "/images/" + imgName + ".jpg"
                                    # print(img['src'])
                                    simpleP.append(allPP[p])
                                    Acontent += str(allPP[p])

                                else:  # otherwise it is plain http
                                    imgSrc = img['src']
                                    # print(imgName)

                                    now = time.strftime(
                                        '%Y%m%d', time.localtime(time.time()))
                                    now_date = now + "/"  # the downloaded file name itself must not contain the slash
                                    imgName = now_date + self.stripImgUrl(
                                        imgSrc)

                                    downhelp.downloadImg(imgSrc,
                                                         imgName=imgName,
                                                         referer=None,
                                                         now_date=now)
                                    img['src'] = "/images/" + imgName + ".jpg"
                                    # print(img['src'])
                                    simpleP.append(allPP[p])
                                    Acontent += str(allPP[p])
                    if (p == 0):  # decide whether this is the head or tail paragraph
                        Hcontent = allPP[p]  # the head paragraph, already rewritten at this point
                    elif (p == len(allPP) - 1):
                        print("Found the tail paragraph")
                        Tcontent = allPP[p]

                # for p in simpleP:
                # dbhelper.insertSimpleP(p)  # same as elsewhere  todo remember to re-enable this
                return title, Hcontent, Tcontent, Acontent
            else:
                title = "凤凰没有找到标题"
                return title, Hcontent, Tcontent, Acontent
        except Exception as e:
            print(e)
            print("Current page: " + url)
            return title, Hcontent, Tcontent, Acontent
    def getEveryFenghuang(self):
        dbhelper = DB()
        dateurl = fenghuangDateUrls()
        oneContent = fenghuangPageContent()
        print("共提取到新闻url的数量有")
        now_date = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")  # 昨天日期
        print(now_date)

        # print(dateurl.getOneDayNewUrl("2018-07-03"))
        #1.页面新闻url写入数据库
        todayNewUrl = dateurl.getUrlLists(now_date)   #1.这个就是当天的
        urlNumer = len(todayNewUrl)

        todayNewUrl = dbhelper.__query__("select url from tengxun where urlState='False' and fromWhere='fenghuang'")  #只要数据库中未填补内容的url
        print(type(todayNewUrl))
        print(len(todayNewUrl))
        # for dic in todayNewUrl:
        #     dic['url']

        print("")

        # 这儿才是把东西提取出来
        count = 1                                                             #计数,每100个就休息1分钟
        print(todayNewUrl)
        flagNumber = 1
        mixNumber = 0
        for dic in todayNewUrl:
            newUrl = dic['url'] #2.把写入数据库的这几个新闻url的内容提取出来
            if newUrl.find("pl.ifeng.com")!=-1:
                title, Hcontent, Tcontent, Acontent = oneContent.getPlContent(newUrl)
                if (title != "凤凰没有找到标题" and title != None and Hcontent != ""):  # 有内容的时候就更新这条数据
                    dbhelper.updateContent(newUrl, title, Hcontent, Tcontent, Acontent)

                    print("正在生成新混合新闻。。。")  # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的
                    mixNews = MixNews()
                    if mixNews.__startToMix__() != True:  # 调用一次就执行一次,可以修改返回的状态
                        print("生成失败,已经没有刚填满的未用过的文章了")
                        print(newUrl)
                        dbhelper.deleteUrl(newUrl)  # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了),
                    else:
                        mixNumber+=1    #成功就生成一个累加
                else:
                    print("更新失败,标题提取失败,为空")
                    dbhelper.deleteUrl(newUrl)  # 按url把这条记录删除掉咯
            else:    #这个就是默认的那个新闻news.ifeng.com
                title, Hcontent, Tcontent, Acontent =oneContent.getNewsContent(newUrl)
                if (title != "凤凰没有找到标题" and title != None and Hcontent != ""):  # 有内容的时候就更新这条数据
                    dbhelper.updateContent(newUrl, title, Hcontent, Tcontent, Acontent)

                    print("正在生成新混合新闻。。。")  # 3. 然后是把页面页写入数据库,再然后是随机生成相同数量的
                    mixNews = MixNews()
                    if mixNews.__startToMix__() != True:  # 调用一次就执行一次,可以修改返回的状态
                        print("生成失败,已经没有刚填满的未用过的文章了")
                        print(newUrl)
                        dbhelper.deleteUrl(newUrl)  # 如何这个内容为空也要删除,(可能前面一个步骤更新的时候发现相同的标题,所以插入不了),
                    else:
                        mixNumber+=1    #成功就生成一个累加
                else:
                    print("更新失败,标题提取失败,为空")
                    dbhelper.deleteUrl(newUrl)  # 按url把这条记录删除掉咯
        print("目前生成了 共有那么多个混合的新闻  "+str(mixNumber))   #普遍存在
Example #11
#coding=utf-8
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from DBcontrol import DB

d = path.dirname(__file__)

# Read the whole text.
# text = open(path.join(d, 'test.text'),encoding='utf-8').read()

chak = DB()
allTogether = chak.getAllTitle()

import jieba


wordlist = jieba.cut(allTogether, cut_all=True)   # full-mode segmentation of all the titles
wl = " ".join(wordlist)
print(wl)  # print the segmented text

coloring = np.array(Image.open(path.join(d, "test.jpg")))

# set the stop words
# stopwords = set(STOPWORDS)
# stopwords.add("said")
Example #12
class DateUrl:
    def __init__(self):
        self.dbhelper = DB()  # created by default
# day by day: combine the pieces to fetch the roll-list news urls

    def getDateUrlList(self, startDate, endDate):  # return the urls between these two dates, writing them to the database along the way

        urlList = []
        timehelper = TimeHelper()
        datelist = []
        if (startDate != endDate):  # compute the span when the dates differ
            datelist = timehelper.getTimeList(startDate, endDate)
        else:
            datelist.append(startDate)
        for oneDay in datelist:
            time.sleep(1.5)  # the API is rate-limited, so pause between requests
            onedatelist = []
            try:
                onedatelist = self.getOneDayNewUrl(oneDay)
            except Exception:
                time.sleep(30)
                onedatelist = self.getOneDayNewUrl(oneDay)
            urlList = urlList + onedatelist

            # todo this couples things too tightly and makes day-to-day debugging awkward
            # self.saveListToMysql(onedatelist,oneDay,"tengxun")  # save each batch into the database
        return urlList

    def getOneDayNewUrl(self, date):
        date = parse.quote_plus("" + date)
        oneDayUrlList = []
        print(str(date))
        # date = "2018-07-26"
        appid = "3639833dae924cb9efb6ba30e6c5a6fa"
        url = "https://api.shenjian.io/?appid=" + appid + "&date=" + date
        # print(url)
        request = urllib.request.Request(url,
                                         headers={
                                             "Accept-Encoding": "gzip",
                                         })

        response = urllib.request.urlopen(request)
        gzipFile = gzip.GzipFile(fileobj=response)
        # print(gzipFile.read().decode('UTF-8'))
        jsonResult = json.loads(str(gzipFile.read().decode('UTF-8')))
        if "data" in jsonResult:
            print(jsonResult['data'])
            print("共有多少个新闻" + str(len(jsonResult['data'])))
            if (len(jsonResult['data']) == 4):
                oneDayUrlList.append(jsonResult['data']['url'])
                return oneDayUrlList
            else:
                for i in jsonResult['data']:
                    # print(i['url'])
                    oneDayUrlList.append(i['url'])
                return oneDayUrlList
        else:
            print("检测到腾讯的api 中无  data key 10分钟后再试")
            time.sleep(60 * 10)  #如果一下子那个api没有反应的话,那就这样操作咯,用进程把,多个cpu哦
            return self.getOneDayNewUrl(date)  #采用递归的方式来处理,,

    #
    # def saveListToMysql(self,lists,date,fromWhere):
    #     connect = DB()
    #     lists = list(set(lists))
    #     for i in lists:
    #         connect.insertTenxun(i,date,fromWhere)
    #     print(fromWhere+" insert finished")
    # connect.__close__()
    def tengxunGundong(self):
        url = 'http://news.qq.com/articleList/rolls/'
        cooker = makeBS()
        soup = cooker.makesoup(url, "computer")

        print(soup)

# ----------------------------------------------------- the new page-url extraction starts below -----------------------------------

    def returnThemeCode(self, theme):
        # per-theme "_" parameter values for the roll api
        themeCodes = {
            'news': 1537874915062,
            'ent': 1537876288634,
            'sports': 1537877689177,
            'finance': 1537878365483,
            'tech': 1537879684280,
            'auto': 1537887032223,
            'house': 1537887128904,
        }
        return themeCodes.get(theme)

    def getThemeUrl(self, theme, today, pageNumber):
        rawUrl = "http://roll.news.qq.com/interface/cpcroll.php"
        rawReferer = '.qq.com/articleList/rolls/'  # 'http://' plus the theme host goes in front

        my_headers = [
            'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
            'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
            'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'
        ]

        headers = {
            "User-Agent": random.choice(my_headers),
            'Referer': 'http://' + theme + rawReferer
        }  # defaults

        rawUrl = rawUrl + "?callback=rollback&mode=1&cata=&_=" + str(
            self.returnThemeCode(theme)) + "&site=" + theme + "&page=" + str(
                pageNumber) + "&date=" + today
        try:
            rawhtml = requests.get(
                rawUrl, headers=headers, allow_redirects=False,
                timeout=30)  # use .text for text; .content is for binary files
            rawhtml.encoding = chardet.detect(rawhtml.content)['encoding']

            print(rawhtml.url)
            print("Status code " + str(rawhtml.status_code))

            if rawhtml.status_code == 504:
                print(504)
                return []
            print("Page response:")
            print(rawhtml.text)
            if rawhtml.text.find('rollback') == 0:
                jsonString = rawhtml.text.split("rollback")[1]  # strip the jsonp callback
            else:
                jsonString = rawhtml.text
            print(jsonString)
            dicData = eval(jsonString)  # see the json.loads sketch after this example
            print(type(jsonString))
            print(jsonString)
            # print(dicData['data']['article_info'])
            print(len(dicData['data']['article_info']))
            if dicData['data'] == "":
                print("Past the last page, just break out")
                return []
            urllist = []
            for one in dicData['data']['article_info']:
                # print(one['url'])
                print(one['url'].replace("\\", "/"))  # still needs checking against the older extraction
                urllist.append(one['url'].replace("\\", "/"))
            return urllist
        except Exception as e:
            # print(e)
            return []  # nothing found

    def pageUrlMain(self, date):  # entry point: give it a date and it returns the urls

        # todo these may all have page2 and beyond
        # url    ="http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537850539512"
        # current-affairs news
        urlNew = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537874915062"
        # international headlines, referer = http://news.qq.com/articleList/rolls/
        urlEnt = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=ent&mode=1&cata=&date=2018-09-25&page=1&_=1537876288634"  # referer = http://ent.qq.com/articleList/rolls/
        # referer = http://ent.qq.com/articleList/rolls/
        urlSport = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=sports&mode=1&cata=&date=2018-09-25&page=1&_=1537877689177"  # this roll seems to be loaded dynamically
        # todo re-parse this one, it may be dynamic; referer = http://sports.qq.com/articleList/rolls/    some themes span more than one page; the sports roll is dynamic and has many pages
        # each theme needs its own referer
        urlFinance = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=finance&mode=1&cata=&date=2018-09-25&page=1&_=1537878365483"
        # referer http://finance.qq.com/articleList/rolls/
        # todo the default parsing works but is slow when going through the proxy http://173.255.210.215:8000/?count=6&country=%E5%9B%BD%E5%86%85
        urlTech = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=tech&mode=1&cata=&date=2018-09-25&page=2&_=1537879684280"
        # referer = http://tech.qq.com/articleList/rolls/
        # todo this one is also occasionally slow; use a proxy

        urlAuto = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=auto&mode=1&cata=&date=2018-09-25&page=1&_=1537887032223"
        # referer for autos: http://auto.qq.com/articleList/rolls/
        # todo the auto roll needs its own parser

        urlHouse = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=house&mode=1&cata=&date=2018-09-25&page=1&_=1537887128904"
        # referer http://house.qq.com/articleList/rolls/   the real-estate roll, parseable
        # getThemeUrl(urlSport,"http://sports.qq.com/articleList/rolls/")
        urlRaw = "http://roll.news.qq.com/interface/cpcroll.php?"  # should probably be requested with these parameters attached
        # print(jsonDic)

        # print(rawhtml.json())

        oneUrl = {}  # even the sports theme alone would be enough
        themeList = [
            'news', 'ent', 'tech', 'auto', 'house', 'finance', 'sports'
        ]  # seven themes for now; there are actually more than these seven
        tempList = []
        # think about the data type: a 2-d array of (url, theme)? both are strings

        for theme in themeList:
            print("Current theme: " + theme)
            for i in range(1, 11):
                print("Page " + str(i))
                responseList = self.getThemeUrl(theme, date,
                                                i)  # this only probes how many pages there are
                if len(responseList) == 0:
                    print("Max page count is " + str(i - 1))
                    break
                else:
                    tempList = tempList + responseList
                    oneUrl[theme] = responseList  # keep the urls separated per theme
                print(oneUrl)
        resultUrl = oneUrl
        tempList = set(tempList)

        from pprint import pprint
        pprint(resultUrl)
        print(len(resultUrl))  # currently returns 653; a full day yesterday was 1140, a respectable count
        # a small function can store them per category; they used to be stored together

        self.dbhelper.saveDicToMysql(oneUrl, date, "tengxun")

        # no classification is needed here; probably because the theme was switched and never put back
        # resultUrl = self.dbhelper.saveListToMysql(resultUrl,date,"tengxun")   # no dedup here  # save to mysql first so the urls can be reused
        # dedup, drop what was already inserted, then return; would still need converting back to a list
        return tempList  # already deduplicated at this point
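
getThemeUrl above eval()s text that arrived over the network, which executes whatever the server sends. A safer parse strips the JSONP wrapper and hands the rest to json.loads; a sketch, assuming the payload is valid JSON once the rollback(...) callback is removed:

import json

def parseRollback(text):
    # "rollback({...})" -> the inner dict, without eval
    if text.startswith("rollback"):
        text = text.split("rollback", 1)[1].strip()
        if text.startswith("(") and text.endswith(")"):
            text = text[1:-1]
    return json.loads(text)

# usage: dicData = parseRollback(rawhtml.text)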
Example #13
class pageContent:
    def __init__(self):
        self.dbhelper = DB()
    def stripImgUrl(self,replacedSrc):
        # Turn an image URL into a safe file name: drop the query string,
        # then replace or strip the remaining punctuation (str.replace is a
        # no-op when the substring is absent, so the find() guards and the
        # duplicated colon checks of the original were unnecessary).
        if replacedSrc.find("?") != -1:
            replacedSrc = replacedSrc.split("?")[0]
        for old, new in ((":", ""), (".", "_"), ("/", "_"), ("-", "_"),
                         ("!", "_"), ("\"", "_"), (" ", ""), ("“", ""),
                         ("”", ""), ("：", ""), ("|", "_")):
            replacedSrc = replacedSrc.replace(old, new)
        return replacedSrc


    # def getPageContent(self,url): # given a url, return the localized article; this version never seems to save anything
    #     # time.sleep(1)
    #
    #     t = time.time()
    #     Acontent = ""
    #     Hcontent,Tcontent = "",""
    #     timeStamp =str(int(round(t * 1000)))  # millisecond timestamp
    #
    #     dbhelper = DB()
    #     simpleP = []
    #
    #     soup =makeBS().makesoup(url)
    #     if soup==None:
    #         title ="这个网页打开超时了"
    #         return  title, Hcontent, Tcontent, Acontent
    #     # print(soup.prettify())
    #     title = soup.find("h1")
    #     if(title!=None):
    #         title = title.text
    #     # print("title ",title)  # a missing title tells us what kind of page this is
    #     # if (title.find(":")!=-1):
    #     #     title  = title.replace(":","")
    #     downloadTool = Download(r'/home/default/images') # set the download path
    #
    #     totalLong = 0
    #     mainDiv =  soup.find("div",attrs={"class":"qq_innerMain clearfix"})
    #     # print(mainDiv)
    #     if(mainDiv==None):
    #         dbhelper.deleteUrl(url)  # no main div means a photo-gallery page; too little text, so drop it
    #         title = "腾讯没找到标题"  # blank everything and treat the page as never visited
    #         print("Error: this page is a photo-gallery page "+url)
    #         return title, Hcontent, Tcontent, Acontent
    #     allcontent = mainDiv.find_all("p")
    #     # print(allcontent)
    #     for i in range(len(allcontent)):  # each of these is a p tag
    #         # print(i)
    #         if(i==0):
    #             Hcontent =allcontent[i]
    #         localImgList = allcontent[i].find_all("img",attrs={"src":True}) # extract the imgs inside each p tag and localize their links
    #         if(localImgList!=None):  # iterate over whatever was found
    #             for img in localImgList:
    #                 if img!=None:
    #                     # print(img['src'])
    #                     if(img['src'].find("//")==0):  # protocol-relative path, prepend https
    #                         imgSrc  = "https:"+img['src']
    #                         # filename = os.path.basename(imgSrc)
    #                         # print(imgSrc)
    #                         imgName = imgSrc.replace("https://inews.gtimg.com/","").replace("/","_")
    #
    #                         now = time.strftime('%Y%m%d', time.localtime(time.time()))
    #                         now_date = now+"/"   # the downloaded file name itself must not contain the slash
    #                         imgName = now_date+self.stripImgUrl(imgName)
    #
    #                         # print("file name is " + imgName)
    #                         # this is the image-format part
    #                         newImgName = downloadTool.downloadImg(imgSrc,imgName=imgName,referer=None, now_date=now)  # the download itself works fine
    #                         img['src']="/images/"+imgName+".jpg"  # downloaded under this name, referenced under this src
    #                         # print(img['src'])
    #                         # print("image link "+imgSrc)
    #                         # print(allcontent[i])
    #                         simpleP.append(allcontent[i])
    #                         Acontent += str(allcontent[i])
    #                     elif(img['src'].find("https:")==0):   # already an https url; the three cases are exclusive, not cumulative
    #                         imgSrc = img['src']
    #                         # filename = os.path.basename(imgSrc)
    #                         # print(imgSrc)
    #                         imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_")
    #
    #                         now = time.strftime('%Y%m%d', time.localtime(time.time()))
    #                         now_date = now+"/"   # the downloaded file name itself must not contain the slash
    #                         imgName = now_date+self.stripImgUrl(imgName)
    #
    #                         print("file name is " + imgName)
    #                         downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now)
    #                         img['src'] = "/images/" + imgSrc.replace("https://inews.gtimg.com/", "").replace("/","_") + ".jpg"
    #                         # print(img['src'])
    #                         # print("image link "+imgSrc)
    #                         # print(allcontent[i])
    #                         simpleP.append(allcontent[i])
    #                         Acontent += str(allcontent[i])
    #                     else:   # otherwise it is plain http
    #                         imgSrc = img['src']
    #                         # filename = os.path.basename(imgSrc)
    #                         # print(imgSrc)
    #                         imgName = imgSrc.replace("http://inews.gtimg.com/", "").replace("/", "_")
    #
    #
    #                         now = time.strftime('%Y%m%d', time.localtime(time.time()))
    #                         now_date = now+"/"   # the downloaded file name itself must not contain the slash
    #                         imgName = now_date+self.stripImgUrl(imgName)
    #
    #                         # print("file name is " + imgName)
    #                         downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now)
    #                         img['src'] = "/images/" + imgSrc.replace("http://inews.gtimg.com/", "").replace("/","_") + ".jpg"
    #                         # print(img['src'])
    #                         # print("image link "+imgSrc)
    #                         # print(allcontent[i])
    #                         simpleP.append(allcontent[i])
    #                         Acontent += str(allcontent[i])
    #
    #
    #
    #         if (allcontent[i].string=="更多新闻" or allcontent[i].string =="扫一扫,用手机看新闻!"):  # boilerplate markers, matched verbatim against the page text
    #             Tcontent =allcontent[i-1]
    #             break
    #         else:
    #             Tcontent = allcontent[-1 ]  # either the very last p or the one just before the cut-off
    #         if (allcontent[i].string!=None and allcontent[i].string!="扫一扫,用手机看新闻!"):
    #             # print(allcontent[i])
    #             Acontent += str(allcontent[i])
    #             simpleP.append(allcontent[i])
    #             totalLong = len(allcontent[i].string) + totalLong
    #
    #     # for p in simpleP:
    #     #
    #     #     dbhelper.insertSimpleP(p)  # inserts the paragraphs but never updates the title and the rest; move this out so the logic stays clean
    #
    #
    #     print("Total characters in this article: "+str(totalLong))
    #     return title,Hcontent,Tcontent,Acontent
    #
    # #—————————————— below is the brand-new crawl that converts pages to the wx mobile version, distinct from the code above --------------------------- the code above was never used, right?



    def findVideo(self,dic):    # takes the half-parsed config string
        if dic.find("ext_data: ")!=-1:  # only parse this when there is a video
            try:
                if dic.find("vid")!=-1:     # the vid field is present
                # dic = dic.replace('"',"'").replace(",","")
                    dic = dic.split("ext_data: ")[1].split("fisoriginal")[0]
                    vid = dic.split("vid")[1].split(",")[0]
                    vid  = vid.split('"')[2]
                    # print(vid)
                    return vid
            except Exception as e:
                print(e)
                return
            return dic
        else:
            return





    def fixUrl(self,url):
        url = url.replace("////","//")
        # print(url)
        return url

    def changeNewToWx(self,url):  # convert a link to the mobile (WeChat-style) form and return it; requesting with a mobile ua would also redirect there
        # doubled slashes in the extracted url have to be collapsed to single ones first
        # print(url)
        if url.find("////")!=-1:
            url = self.fixUrl (url)
            # print(url)
        rawWxUrl = "https://xw.qq.com/"

        if url.find("qq.com/")!=-1:
            splitUrl = url.split(".qq.com/")
            tailUrl = splitUrl[1].replace(".htm","")
            headUrl = splitUrl[0]           # kept so the theme can be extracted from it

            if headUrl.find("//")!=-1:   # extract the theme
                theme=headUrl.split("//")[1]
                tailUrl = tailUrl.split("//")[0]+"/"+theme+"/"+tailUrl.split("//")[1]+tailUrl.split("//")[2]
            else:
                return                                                                 # otherwise hand back None
            # print("converted url -"+rawWxUrl+tailUrl)
            sumUrl = rawWxUrl+tailUrl
            # print(sumUrl)
            # print(rawWxUrl)
            # print(tailUrl)
            # to keep earlier behavior intact, check whether a "//" remains past the scheme and collapse it to "/"

            if len(sumUrl.split("//"))>1: # length 1 means there is nothing to fix
                tempUrl = sumUrl.split("//")
                totalUrl = tempUrl[0]+"//"+ "/".join(tempUrl[1:])
                # print(totalUrl)
                return(totalUrl)


            # tempWxUrl = rawWxUrl + tailUrl
            # print(tempWxUrl)

            return sumUrl

    def getWxContentNew(self,url):
        wxUrl = self.changeNewToWx(url)
        Cooker = makeBS()
        title, Hcontent, Tcontent, Acontent = "", "", "", ""  # the last field hardly seems used
        downloadTool = Download(r'/home/default/images')  # set the download path
        BS = Cooker.makesoup(wxUrl) # this must be given the wx (mobile) form of the url


    def getWxContent(self,wxUrl):  # pull the embedded config out first, then see what we have  ---- this is the main parsing function
        time.sleep(1)

        title, Hcontent, Tcontent, Acontent = "", "", "", ""  # the last field hardly seems used
        downloadTool = Download(r'/home/default/images')  # set the download path
        simplePList = []

        Cooker = makeBS()
        BS = Cooker.makesoup(wxUrl) # this must be given the wx (mobile) form of the url
        # print(BS)
        if BS==None:
            return title, Hcontent, Tcontent, Acontent
        # print(BS)
        try:
            title = BS.find("head").title    # take the title tag straight out of head
        except Exception as e:
            print(e)
            traceback.print_exc()  # one failure here seems to cascade into the rest
            return title, Hcontent, Tcontent, Acontent

        # print("print the title")

        # todo insert a check here that the title exists

        # print(title.text)                 # todo switch to a mobile ua so the server sends the mobile page; the current ua may still fetch the desktop version
        if(title!=None and title.text!="404 - 腾讯网"):
            title=title.text
            if self.dbhelper.ifExists(title):  # the title already exists, which is why nothing further gets read
                print("This one already exists")
                # empty values go back in this case too
                return title,Hcontent,Tcontent,Acontent  # already stored, so skip parsing and image downloads; a title with nothing else usually means a duplicate
        else:
            print("This news item was probably deleted; extraction failed")
            return title,Hcontent,Tcontent,Acontent

        dicString = ""
        '''
        test area
        '''
        print(BS)

        for script in BS.find_all("script", attrs={"async": False}):  # look for the script whose body is not empty
            if script.text!=None and script.text!="":
                print(script.text)
                dicString = script.text
                break



        # dicString = BS.find("script", attrs={"async": False}).text  # should pick the non-empty one
        # print(dicString)
        print(dicString)
        dic = dicString.replace("var globalConfig =", "").replace(";", "")
        # print(dic)
        tempDic = dic
        print("parsed article section ->")  # this can come back empty // todo revisit the tengxun parsing; the page problems seem to come from this config blob
        print(tempDic)
        if dic.find("contents: ") != -1:

            datalist = dic.split("contents: ")[1].split("ext_data")[0].replace("[", "").replace("],", "")
            # print("这边开始这样")
            #这个是新加的。
            print(datalist)

            try:
                dic = eval("(" + datalist + ")")  #因为加了这个才能转化成那样,这个应该也是没问题才对。
                # print(dic)
            except Exception as e:
                print("转化成json出错")
                print(e)
                traceback.print_exc()  # 貌似这个,一个错,各个错。
                return title,Hcontent,Tcontent,Acontent  #存在的话,就不用再解析和下载图片了

                #return #返回空内容给他们咯,自动舍弃这个东西,然后那边要处理了一下,空内容的情况,这个不对啊。
            checkLen = len(dic)
            pprint.pprint(dic)
            print(checkLen)  #这儿需要考虑只有一个句子的情况,这个情况下是长度为2,这个情况下不是列表了,只是一个字典
            if(checkLen>2):
                for p in dic:   # walk each piece and classify it
                    try:
                        if p['type'] == 1:  # a plain sentence, insert it directly
                            # treat it as a sentence

                            pContent = p['value']
                            phtml='<p>'+pContent+"</p>"

                            Acontent =Acontent+phtml

                            if dic.index(p)==0:                      # index 0 means this is the opening paragraph
                                Hcontent= phtml
                                # print("find the header p")
                                # print(phtml)
                            elif dic.index(p)==checkLen-1 :            # likewise the last sentence becomes the tail
                                Tcontent = phtml
                            else:  # everything between head and tail goes into the loose-paragraph list
                                simplePList.append(phtml)



                        # print(phtml)
                        if p['type'] == 2:
                            imgSrc = p['value'].replace("\/", "/")                    # the real download address of the image
                            # print(imgSrc)
                            imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_")
                            now = time.strftime('%Y%m%d', time.localtime(time.time()))
                            now_date = now + "/"  # the downloaded file name itself must not contain the slash
                            imgName = now_date + self.stripImgUrl(imgName)   # the date prefix is now part of the name

                            # print("file name is " + imgName)                                # download the image to the configured server path
                            downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now)

                            # now_date = now + "/"                                     # already handled above
                            # imgName = now_date + self.stripImgUrl(imgName)
                            imgPScr = "/images/" + imgName + ".jpg"                 # the image address in html form
                            HtmlImg = '<p><img src="'+imgPScr+'"/></p>'

                            # print(imgPScr)

                            Acontent = Acontent + HtmlImg
                            # print("this one is an image "+HtmlImg)

                            if dic.index(p)==0:                      # index 0 means this is the opening paragraph
                                # print("find the header p")
                                # print(HtmlImg)
                                Hcontent= HtmlImg
                            elif dic.index(p)==checkLen-1 :            # the last one becomes the tail
                                Tcontent = HtmlImg
                            else:  # everything between head and tail goes into the loose-paragraph list
                                simplePList.append(HtmlImg)

                        if p['type'] == 3:  # the video case; take its description as a sentence (there are images here too, ignored for now)
                            try:
                                pContent = p['value']['desc']
                                pContent = "<p>"+ pContent+"</p>"
                                # parse the video
                                vid = self.findVideo(tempDic)
                                rawVideoString = ""
                                if vid != None:
                                    rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid=' + vid + '" allowFullScreen="true"></iframe></p>'

                                if dic.index(p) == 0:  # index 0 means this is the opening paragraph
                                    # print("find the header p")
                                    # print(pContent)
                                    Hcontent = pContent+rawVideoString
                                elif dic.index(p) == checkLen - 1:  # the last one becomes the tail
                                    Tcontent =pContent+rawVideoString
                                else:                              # everything between head and tail goes into the loose-paragraph list
                                    simplePList.append(pContent)

                            except Exception as e:
                                pass  # drop this paragraph
                    except Exception as e:
                        print(e)
                        traceback.print_exc()   # one failure here seems to cascade into the rest
                # insert into the database first
                # for p in simplePList:
                #     self.dbhelper.insertSimpleP(p)  # inserts the paragraphs but never updates the title and the rest

                Tcontent = "".join(BeautifulSoup(Acontent, 'lxml').text)  # note: this overwrites the tail chosen above with the plain text of the whole article
                return title, Hcontent, Tcontent, Acontent

            else:                                                          # the page has only one content element; splitting that single big paragraph into head/tail works fine
                p = dic                                                    # just alias it
                # print(type(dic))
                # print(dic)
                # print(p)
                if type(p) == tuple:
                    print("it is a tuple")
                    try:
                        # print("length:")
                        # print(len(p))
                        if len(p) == 1:          # unwrap a one-element tuple
                            p = p[0]
                        elif len(p) == 2:        # two elements: the second one carries the content
                            p = p[1]
                        else:                           # nothing but the title, no text content at all
                            p = {'type': 3}                     # fall back to the video case, with the title standing in for head and tail

                    except Exception as e:
                        print(e)           # handles pages like https://xw.qq.com/a/house/20180928003713
                        title, Hcontent, Tcontent, Acontent = "", "", "", ""
                        return title, Hcontent, Tcontent, Acontent  # give up on this url's content entirely

                # print(p)
                if p['type'] == 1:                                       # a single sentence: insert it directly
                    # treat it as a plain sentence
                    pContent = p['value']                                   # the sentence serves as head, tail, and middle paragraph
                    # print("number of sentences:")
                    # print(pContent.split("。"))
                    # print(len(pContent.split("。")))
                    try:
                        Tcontent = "<p>"+pContent.split("。")[-2] +"</p>"           # the sentence before the final '。' is the real last sentence
                    except Exception as e:
                        Tcontent = "<p>"+pContent.split("。")[0] +"</p>"   # if it cannot be split (a one-sentence text), head and tail are the same
                    Hcontent = "<p>"+pContent.split("。")[0] +"</p>"            # the first sentence becomes the head
                    simplePList.append(pContent)                                 # keep the whole paragraph; nothing better to do

                    phtml = '<p>' + pContent + "</p>"
                    Acontent = Acontent + phtml
                    # print(phtml)

                if p['type'] == 2:
                    imgSrc = p['value'].replace("\/", "/")  # the real download address of the image
                    # print(imgSrc)
                    imgName = imgSrc.replace("https://inews.gtimg.com/", "").replace("/", "_")
                    now = time.strftime('%Y%m%d', time.localtime(time.time()))
                    now_date = now + "/"  # the downloaded file name itself must not contain the slash
                    imgName = now_date + self.stripImgUrl(imgName)

                    # download this image to the configured path on the server
                    downloadTool.downloadImg(imgSrc, imgName=imgName, referer=None, now_date=now)

                    imgPScr = "/images/" + imgName + ".jpg"  # the image src used in the generated html

                    # print(imgPScr)
                    HtmlImg = '<p><img src="' + imgPScr + '"/></p>'
                    Acontent = Acontent + HtmlImg
                    # print("image paragraph: " + HtmlImg)

                if p['type'] == 3:                  # a single video: parse it out
                    pContent = title
                    # print(pContent)

                    # parse the video id
                    vid = self.findVideo(tempDic)
                    rawVideoString = ""
                    if vid != None:
                        rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid='+vid+'" allowFullScreen="true"></iframe></p>'

                    # a single-element page has no index to check, so the title is the only possible head
                    Hcontent = pContent                  # the head is the description
                    Tcontent = rawVideoString            # the tail is the video as a standalone paragraph
                # insert the loose paragraphs into the DB first
                for p in simplePList:
                    self.dbhelper.insertSimpleP(p)  # insert the paragraphs; the title and the rest are not updated here
                Tcontent = BeautifulSoup(Acontent, 'lxml').text  # overwrite Tcontent with the plain text of the whole article
                return title, Hcontent, Tcontent, Acontent                  # the caller writes the result into the DB



    def getPageContentMain(self, sportsUrl):
        print(sportsUrl)

        title, Hcontent, Tcontent, Acontent = self.getWxContent(self.changeNewToWx(sportsUrl))  # convert to the mobile (xw.qq.com) url and parse it via getWxContent

        print(title)
        # print(Hcontent)
        # print(Tcontent)
        # print(Acontent)
        return title, Hcontent, Tcontent, Acontent
Example #14
0
# This script classifies the crawled news into the django database tables.
# newssentimentanalysis_homenews is a sample table name; each row is dispatched to the matching per-category table.
from DBcontrol import DB

chak = DB()
# chak.getAllTitle()
# chak.saveDicToMysql(testDic,"2019-03-18","tengxun")
# chak.insertTengxunTheme("www", "2018-232", "test", "auto")  # todo: run de-duplication after the crawl finishes.
resultDic = chak.__query__(
    "select url,title,urlState,Hcontent,Mcontent,Tcontent,Acontent,newdate,fromWhere from tengxun where urlState='True'"
)
print(resultDic)
print(len(resultDic))
# the rows are not updated here; everydaynews already updated the content together with urlState
print("starting classification")
for rowDic in resultDic:
    print(rowDic)  # seven categories, e.g. newssentimentanalysis_caranalysis_comment
    sql = ""
    sqlHead = "insert into newssentimentanalysis_"  # sql head for the per-category main news table
    sqlTail = "news (url,Title,UrlState,Hcontent,Mcontent,Tcontent,Acontent,Date,fromWhere)values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    sql2 = ""
    # sql2Head ="insert into newssentimentanalysis_"   # sql head for the per-article sentiment-score table
    sql2Tail = "analysis_news(Pos_Score,Neg_score,Sentiment,News_id_id)values (%s,%s,%s,%s)"
    if rowDic['url'].find('auto') != -1:  # 'auto' in the url means car news; the middle part is the table name, e.g. newssentimentanalysis_entertainmentanalysis_news
        sql = sqlHead + "car" + sqlTail
        sql2 = sqlHead + "car" + sql2Tail  # same prefix, so sqlHead is reused
        pass
    if rowDic['url'].find('tech') != -1:  # 'tech' in the url means technology news
Example #15
0
            url[0])
        if (title != "" and Hcontent != ""):
            datehelper.updateContent(url[0], title, Hcontent, Tcontent,
                                     Acontent)
            mixNews = MixNews()
            state = mixNews.__startToMix__()
            if (state):  # the mix step already writes its result into the DB
                datehelper.updateState(url[0])


if __name__ == "__main__":  # entry point: fill in the content for the pending urls
    fillTengxun = pageContent()
    fillFenghaung = fenghuangPageContent()
    fillWangyi = wangyiPageContent()

    datehelper = DB()

    # tengxunUrls = datehelper.__query__('select url from tengxun where fromWhere ="tengxun" and url!="" and isNull(title) and urlState="False";')
    # fenghuangUrls = datehelper.__query__('select url from tengxun where fromWhere ="fenghuang" and url!="" and isNull(title) and urlState="False" ;')
    wangyiUrls = datehelper.__query__(
        'select url from tengxun where fromWhere ="wangyi" and url!="" and isNull(title) and urlState="False" ;'
    )

    # print(len(tengxunUrls))
    # print(len((fenghuangUrls)))
    print(len((wangyiUrls)))

    # tengxun = myThread(tengxunFill,tengxunUrls,"tengxun")
    # tengxun.run()
    #
    wangyi = myThread(wangyiFill, wangyiUrls, "wangyi")
Example #16
0
# Patch up the assorted fields across the three source tables; some rows already exist but still need fixing.
from DBcontrol import DB

dbhelp = DB()
Example #17
0
    def saveListToMysql(self, lists, date, fromWhere):  # write a list of urls into MySQL
        connect = DB()
        lists = list(set(lists))  # de-duplicate first
        for i in lists:
            connect.insertTenxun(i, date, fromWhere)
        print("finished inserting all urls")
Example #18
0
    def getEveryTengxun(self):  # effectively the main routine, wrapped in the class
        # before starting, check whether a serialized file already exists here
        dbhelper = DB()
        # todo: the per-page extraction failures below deserve a separate look (tuple results; urls and publish dates that do not match):
        # {'type': 2, 'value': 'http:\\/\\/inews.gtimg.com\\/newsapp_match\\/0\\/5261922136\\/0'}
        # 纪念品牌20周年 smart Forease官图发布
        # (log) page open/extract failed, probably a 404 on Tencent; deleting this url
        # (log) deleted http:////auto.qq.com//a//20181005//001598.htm
        # http:////news.qq.com//a//20181005//002590.htm
        # qqnews
        # 2
        pcontent = pageContent()
        # print("number of news urls extracted:")
        # now_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # the current date, refreshed on every run
        now_date = (date.today() + timedelta(days=-1)).strftime(
            "%Y-%m-%d")  # yesterday's date
        # time.localtime(time.time())  # temporary; may change later
        print("yesterday was " + now_date + ", crawling yesterday's news now!😀")
        # 1. fetch the new urls for one day

        # crawl last night's news
        dateUrl = DateUrl(
        )  # edited 2018-09-27  todo: split this apart; the next step only needs to pull the rows where urlState="False"
        dateUrl.pageUrlMain(now_date)  # fetch the day's urls and write them into the DB  todo: switch this back later

        todayNewUrl = dbhelper.__query__(
            "select url from tengxun where urlState='False' and fromWhere='tengxun'"
        )  # pull only the urls that still need to be read
        # print(type(todayNewUrl))
        print(len(todayNewUrl))
        print("")
        # here the actual content is extracted
        count = 1  # counter: pause periodically
        for dic in todayNewUrl:  # walk the pending urls
            url = dic['url']  # 2. extract the content behind each new url
            if count % 200 == 0:  # the sudden interruptions are probably connection drops, so throttle
                time.sleep(60 * 2)  # rest 2 minutes every 200 urls
            count += 1

            # this stored url is the desktop one, not yet converted to the mobile xw. version
            # the page still has to be opened to find out whether the title already exists

            title, Hcontent, Tcontent, Acontent = pcontent.getPageContentMain(
                url)  # converts to the xw. url automatically and downloads; updating the url itself was missed here
            time.sleep(1)
            # print(title, Hcontent, Tcontent, Acontent)
            if (title != "腾讯没找到标题" and title != None
                    and Hcontent != ""):  # the sentinel literal means "Tencent: no title found"; update the row when there is content
                # print("url to update: " + url)
                resultState = dbhelper.updateContent(url, title, Hcontent,
                                                     Tcontent,
                                                     Acontent)  # the row whose update failed is the one that gets deleted
                if resultState == False:  # the update failed
                    print("update failed; deleting this news item (different url, duplicate title)")
                    print(url)
                    dbhelper.deleteUrl(url)  # delete the row by url; a failed mix alone would not require deleting it
                    print()
                else:
                    # print("generating a new mixed article...")  # 3. after writing the page in, generate the same number of random mixes
                    mixNews = MixNews()
                    if mixNews.__startToMix__() != True:  # each call produces one article and reports whether it worked
                        print("mix failed: no freshly filled, unused articles are left")
                        print(url)
                        dbhelper.deleteUrl(
                            url)  # delete when the content is empty too (the earlier update may have hit a duplicate title and not inserted)
                        # print()
            else:
                print("page open/extract failed, probably a 404 on Tencent; deleting this url")  # empty result: drop the row
                dbhelper.deleteUrl(url)  # delete the row by url
Example #19
0
#coding=utf-8
#
'''
1. This is the integrated runner: first pull the links whose content still has to be crawled from the database.
2. Then extract each page with pageContent, which also localizes the images automatically.
3. Then randomly recombine the article paragraphs into a piece of no less than 800 characters.
# Only used to fill the base corpus.
'''
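# A minimal sketch of the three steps above, using the helper names from this
# file (behavior assumed from the rest of this document):
#   URLandDate = datahelper.getLimitUrl(0, 1000, "fenghuang")   # 1. pull pending links
#   title, H, T, A = page.getPageContent(url)                   # 2. parse the page + localize images
#   datahelper.updateContent(url, title, H, T, A)               #    store the full article and its head/tail
#   MixNews().__startToMix__()                                  # 3. recombine paragraphs into a >=800-char article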
from DBcontrol import DB
from fenghuang.fenghuangPageContent import fenghuangPageContent

datahelper = DB()
URLandDate = datahelper.getLimitUrl(0, 1000,
                                    "fenghuang")  # later, pick an exact total and iterate over it in slices
print(URLandDate)  # from here on it works like the regular crawler, at a fixed request rate
page = fenghuangPageContent()
for url, date in URLandDate:
    title, Hcontent, Tcontent, Acontent = page.getPageContent(url)
    print(Acontent)
    datahelper.updateContent(url, title, Hcontent, Tcontent,
                             Acontent)  # the complete article plus the split-out head and tail
    # datahelper.i

    # time.sleep(0.5)
print("finished writing this batch of content")

# only after this can the mixed articles be randomly assembled
# mixP
Example #20
0
            return False
    except:
        print("Check process ERROR!!!")
        return False
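# A minimal sketch of what isRunning presumably checks (assumption: its full body
# is truncated above; something like grepping the process table for the command):
#   out = os.popen("ps -ef | grep '" + cmd + "' | grep -v grep").read()
#   return out.strip() != ""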


def readfile(tfile):
    with open(tfile, 'r') as f:
        lines = f.readlines()
        return lines[-20:]



if __name__ == "__main__":
    dbhelper = DB()
    tempNumber = len(
        dbhelper.__query__("select * from c_title"))  # baseline count, to see how much it grows
    email = EMail()
    while (1):
        if isRunning("python everydaynews.py"):
            print("the crawler is still running... checking again in 12 hours")
            nonNumber = len(
                dbhelper.__query__("select * from c_title"))  # current count, to see how much changed
            print(str(nonNumber - tempNumber))
            face = "🤔not sure what face to make: if it has not crashed it is just idling; checking again in 6 hours"
            if (nonNumber - tempNumber) > 800:
                face = "\n🤣a decent haul today😘"
            elif ((nonNumber - tempNumber)) > 600 and (
                (nonNumber - tempNumber)) <= 800:
                face = "🤗today's volume looks normal"
Example #21
0
    def __init__(self):
        self.dbhelper = DB()
Example #22
0
class pageContent:
    def __init__(self):
        self.dbhelper = DB()

    def stripImgUrl(self, replacedSrc):
        # Sanitize an image url into a safe file name: drop anything after "?",
        # then strip or replace the characters that are awkward in file names.
        # (The table keeps the original rules and their order; str.replace()
        # is already a no-op when the character is absent, and a few entries
        # are redundant duplicates kept from the original checks.)
        replacedSrc = replacedSrc.split("?")[0]
        for old, new in ((":", ""), (":", "_"), (".", "_"), ("/", "_"),
                         ("-", "_"), ("?", "_"), ("!", "_"), ("\"", "_"),
                         (" ", ""), ("“", ""), ("”", ""), (":", ""),
                         ("|", "_")):
            replacedSrc = replacedSrc.replace(old, new)
        return replacedSrc
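    # Usage sketch (hypothetical input) showing what stripImgUrl produces:
    #   stripImgUrl("newsapp_match/0/5261922136/0.jpg?imageView2/1")
    #   -> "newsapp_match_0_5261922136_0_jpg"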

    def findVideo(self, dic):  # takes the half-parsed page script as a string
        if dic.find("ext_data: ") != -1:  # only parse when the page actually has a video
            try:
                if dic.find("vid") != -1:  # the vid field is present
                    # dic = dic.replace('"',"'").replace(",","")
                    dic = dic.split("ext_data: ")[1].split("fisoriginal")[0]
                    vid = dic.split("vid")[1].split(",")[0]
                    vid = vid.split('"')[2]
                    # print(vid)
                    return vid
            except Exception as e:
                print(e)
                return
            return  # no vid field found (returning the raw script here would leak into the iframe url)
        else:
            return
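    # Input/output sketch: given a script fragment such as
    #   ... ext_data: {"vid": "abc123", ...} ... fisoriginal ...
    # findVideo returns "abc123"; the value is cut out purely by string splitting.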

    def fixUrl(self, url):
        url = url.replace("////", "//")
        # print(url)
        return url

    def changeNewToWx(self,
                      url):  # convert a desktop url into the mobile (WeChat-style) xw.qq.com url and return it; a mobile request header would also trigger the redirect
        # the stored double slashes have to be collapsed to single ones first
        # print(url)
        if url.find("////") != -1:
            url = self.fixUrl(url)
            # print(url)
        rawWxUrl = "https://xw.qq.com/"
        if url.find("qq.com/") != -1:
            splitUrl = url.split(".qq.com/")
            tailUrl = splitUrl[1].replace(".htm", "")
            headUrl = splitUrl[0]  # used to extract the theme (channel)
            if headUrl.find("//") != -1:  # extract the theme
                theme = headUrl.split("//")[1]
                tailUrl = tailUrl.split(
                    "//")[0] + "/" + theme + "/" + tailUrl.split(
                        "//")[1] + tailUrl.split("//")[2]
            else:
                return
            sumUrl = rawWxUrl + tailUrl
            # print(sumUrl)
            # print(rawWxUrl)
            # print(tailUrl)
            if len(sumUrl.split("//")) > 1:  # length 1 would mean nothing left to fix
                tempUrl = sumUrl.split("//")
                totalUrl = tempUrl[0] + "//" + "/".join(tempUrl[1:])
                return (totalUrl)
            return sumUrl
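    # Traced example with a stored double-slash url (matching the samples noted earlier):
    #   changeNewToWx("http:////auto.qq.com//a//20181005//001598.htm")
    #   -> "https://xw.qq.com/a/auto/20181005001598"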

    def getWxContent(self, wxUrl,
                     crawlDate):  # extract the embedded content dict and parse it  ---- this is the main parsing function
        # use the date string passed in by the caller; that is safer here.
        now = crawlDate.replace("-", "")

        time.sleep(1)
        title, Hcontent, Tcontent, Acontent = "", "", "", ""  # the last value seems unused by some callers
        # downloadTool = Download(r'/home/default/images')  # a custom download path can also be set manually.
        downloadTool = Download(None)  # download path; None means use the default
        simplePList = []
        Cooker = makeBS()
        BS = Cooker.makesoup(wxUrl)  # the url passed in must already be the mobile (xw.) one
        if BS == None:
            return title, Hcontent, Tcontent, Acontent
        # print(BS)
        try:
            title = BS.find("head").title  # take the <title> tag straight from <head>
        except Exception as e:
            print(e)
            traceback.print_exc()  # once one part breaks, the rest tend to break too
            return title, Hcontent, Tcontent, Acontent

        if (title != None and title.text != "404 - 腾讯网"):
            title = title.text
            if self.dbhelper.ifExists(
                    title):  # the title already exists, so skip the rest of the parsing
                print("this title already exists")
                return title, Hcontent, Tcontent, Acontent  # no need to parse or download images; a bare title usually means a duplicate
        else:
            print("this news item was probably deleted; extraction failed")
            return title, Hcontent, Tcontent, Acontent

        dicString = ""
        for script in BS.find_all("script", attrs={"async":
                                                   False}):  # look for the script whose body is not empty
            if script.text != None and script.text != "":
                print(script.text)
                dicString = script.text
                break

        # print(dicString)
        dic = dicString.replace("var globalConfig =", "").replace(";", "")
        # print(dic)
        tempDic = dic
        print("parsed article fragment ->")
        # print(tempDic)
        if dic.find("contents: ") != -1:

            datalist = dic.split("contents: ")[1].split("ext_data")[0].replace(
                "[", "").replace("],", "")
            # print("starting here")
            # this part is newly added.
            # print(datalist)

            try:
                dic = eval("(" + datalist + ")")  # wrapping in parentheses lets eval turn it into a tuple/dict
                # print(dic)
            except Exception as e:
                print("failed to convert to json")
                print(e)
                traceback.print_exc()  # once one part breaks, the rest tend to break too
                return title, Hcontent, Tcontent, Acontent

            checkLen = len(dic)
            # pprint.pprint(dic)
            # print(checkLen)  # a page with a single sentence has length 2 here and is a dict rather than a list
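            # Shape of the embedded script being parsed (assumption, reconstructed
            # from the splits above and the sample value noted earlier in this document):
            #   var globalConfig = { ..., contents: [
            #       {"type": 1, "value": "a text paragraph"},
            #       {"type": 2, "value": "http:\/\/inews.gtimg.com\/..."},      # image
            #       {"type": 3, "value": {"desc": "video description", ...}}],  # video
            #     ext_data: {... "vid": "..." ...}, ... }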
            if (checkLen > 2):
                for p in dic:  # walk every element and dispatch on its type
                    try:
                        if p['type'] == 1:  # a sentence: insert it directly
                            # treat it as a plain sentence
                            pContent = p['value']
                            phtml = '<p>' + pContent + "</p>"
                            Acontent = Acontent + phtml
                            if dic.index(p) == 0:  # the first element becomes the head
                                Hcontent = phtml
                            elif dic.index(
                                    p) == checkLen - 1:  # likewise, the last element becomes the tail
                                Tcontent = phtml
                            else:  # paragraphs that are neither head nor tail go into the loose-paragraph list
                                simplePList.append(phtml)

                        if p['type'] == 2:
                            imgSrc = p['value'].replace("\/", "/")  # the real download address of the image
                            # print(imgSrc)
                            imgName = imgSrc.replace(
                                "https://inews.gtimg.com/",
                                "").replace("/", "_")
                            # now = time.strftime('%Y%m%d', time.localtime(time.time()))   # must stay a string; the date has to match the one in the url

                            imgName = self.stripImgUrl(
                                imgName)  # pass just the file name; fall back to the line above if this causes trouble

                            # download this image to the configured path on the server
                            downloadTool.downloadImg(imgSrc,
                                                     imgName=imgName,
                                                     referer=None,
                                                     now_date=now)
                            # imgName = now +"/"+ self.stripImgUrl(imgName)   # variant with the date prefix included
                            # src format, e.g. /static/images/3.jpg or /static/images/20190309/3.jpg
                            imgPScr = "/static/images/" + now + "/" + imgName + ".jpg"  # the image src used in the generated html
                            HtmlImg = '<p><img src="' + imgPScr + '"/></p>'
                            # time.sleep(60)
                            # print(imgPScr)

                            Acontent = Acontent + HtmlImg
                            # print("image paragraph: "+HtmlImg)

                            if dic.index(p) == 0:  # the first element becomes the head
                                # print("find the header p")
                                # print(HtmlImg)
                                Hcontent = HtmlImg
                            elif dic.index(
                                    p) == checkLen - 1:  # the last element becomes the tail
                                Tcontent = HtmlImg
                            else:  # paragraphs that are neither head nor tail go into the loose-paragraph list
                                simplePList.append(HtmlImg)

                        if p['type'] == 3:  # video case: extract the description and treat it as a sentence; any attached image is ignored for now
                            try:
                                pContent = p['value']['desc']
                                pContent = "<p>" + pContent + "</p>"
                                # parse the video id
                                vid = self.findVideo(tempDic)
                                rawVideoString = ""
                                if vid != None:
                                    rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid=' + vid + '" allowFullScreen="true"></iframe></p>'

                                if dic.index(p) == 0:  # the first element becomes the head
                                    # print("find the header p")
                                    # print(pContent)
                                    Hcontent = pContent + rawVideoString
                                elif dic.index(
                                        p
                                ) == checkLen - 1:  # the last element becomes the tail
                                    Tcontent = pContent + rawVideoString
                                else:  # paragraphs that are neither head nor tail go into the loose-paragraph list
                                    simplePList.append(pContent)

                            except Exception as e:
                                pass  # discard this paragraph
                    except Exception as e:
                        print(e)
                        traceback.print_exc()  # once one paragraph breaks, the rest tend to break too
                # the loose paragraphs used to be inserted into the DB here
                # for p in simplePList:
                #     self.dbhelper.insertSimpleP(p)  # insert the paragraphs; the title and the rest are not updated here

                Tcontent = BeautifulSoup(Acontent, 'lxml').text  # overwrite Tcontent with the plain text of the whole article
                return title, Hcontent, Tcontent, Acontent

            else:  # the page has only one content element; splitting that single big paragraph into head/tail works fine
                p = dic  # just alias it
                # print(type(dic))
                # print(dic)
                # print(p)
                if type(p) == tuple:
                    print("it is a tuple")
                    try:
                        # print("length:")
                        # print(len(p))
                        if len(p) == 1:  # unwrap a one-element tuple
                            p = p[0]
                        elif len(p) == 2:  # two elements: the second one carries the content
                            p = p[1]
                        else:  # nothing but the title, no text content at all
                            p = {'type': 3}  # fall back to the video case, with the title standing in for head and tail

                    except Exception as e:
                        print(
                            e
                        )  # handles pages like https://xw.qq.com/a/house/20180928003713
                        title, Hcontent, Tcontent, Acontent = "", "", "", ""
                        return title, Hcontent, Tcontent, Acontent  # give up on this url's content entirely

                # print(p)
                if p['type'] == 1:  # a single sentence: insert it directly
                    # treat it as a plain sentence
                    pContent = p['value']  # the sentence serves as head, tail, and middle paragraph
                    # print("number of sentences:")
                    # print(pContent.split("。"))
                    # print(len(pContent.split("。")))
                    try:
                        Tcontent = "<p>" + pContent.split(
                            "。")[-2] + "</p>"  # the sentence before the final '。' is the real last sentence
                    except Exception as e:
                        Tcontent = "<p>" + pContent.split(
                            "。")[0] + "</p>"  # if it cannot be split (a one-sentence text), head and tail are the same
                    Hcontent = "<p>" + pContent.split(
                        "。")[0] + "</p>"  # the first sentence becomes the head
                    simplePList.append(pContent)  # keep the whole paragraph; nothing better to do

                    phtml = '<p>' + pContent + "</p>"
                    Acontent = Acontent + phtml
                    # print(phtml)

                if p['type'] == 2:
                    imgSrc = p['value'].replace("\/", "/")  # the real download address of the image
                    # print(imgSrc)
                    imgName = imgSrc.replace("https://inews.gtimg.com/",
                                             "").replace("/", "_")
                    # now = time.strftime('%Y%m%d', time.localtime(time.time()))  # todo: this date should be one day earlier.
                    now_date = now + "/"  # the downloaded file name itself must not contain the slash
                    # imgName = now_date + self.stripImgUrl(imgName)   # variant with the date prefix included
                    imgName = self.stripImgUrl(
                        imgName)  # pass just the file name; fall back to the line above if this causes trouble

                    # download this image to the configured path on the server
                    downloadTool.downloadImg(imgSrc,
                                             imgName=imgName,
                                             referer=None,
                                             now_date=now)

                    imgPScr = "/static/images/" + now + "/" + imgName + ".jpg"  # the image src used in the generated html

                    # print(imgPScr)
                    HtmlImg = '<p><img src="' + imgPScr + '"/></p>'
                    Acontent = Acontent + HtmlImg
                    # print("image paragraph: " + HtmlImg)

                if p['type'] == 3:  # a single video: parse it out
                    pContent = title
                    # print(pContent)

                    # parse the video id
                    vid = self.findVideo(tempDic)
                    rawVideoString = ""
                    if vid != None:
                        rawVideoString = '<p><iframe frameborder="0" src="https://v.qq.com/txp/iframe/player.html?vid=' + vid + '" allowFullScreen="true"></iframe></p>'

                    # a single-element page has no index to check, so the title is the only possible head
                    Hcontent = pContent  # the head is the description
                    Tcontent = rawVideoString  # the tail is the video as a standalone paragraph
                # the loose paragraphs used to be inserted into the DB here
                # for p in simplePList:
                #     self.dbhelper.insertSimpleP(p)  # insert the paragraphs; the title and the rest are not updated here
                Tcontent = BeautifulSoup(Acontent, 'lxml').text  # overwrite Tcontent with the plain text of the whole article
                return title, Hcontent, Tcontent, Acontent  # the caller writes the result into the DB

    def getPageContentMain(self, sportsUrl, crawlDate):
        print(sportsUrl)
        title, Hcontent, Tcontent, Acontent = self.getWxContent(
            self.changeNewToWx(sportsUrl), crawlDate)  # convert to the mobile (xw.qq.com) url and parse it via getWxContent
        print(title)
        print(Hcontent)
        print(Tcontent)
        print(Acontent)
        return title, Hcontent, Tcontent, Acontent
Example #23
0
    def __init__(self):
        self.dbhelper = DB()  # created by default
Example #24
0
    def getEveryTengxun(self):
        dbhelper = DB()  # database access
        pcontent = pageContent()  # page-detail parsing

        now_date = (date.today() + timedelta(days=-1)).strftime(
            "%Y-%m-%d")  # yesterday's date
        print("yesterday was " + now_date + ", crawling yesterday's news now!😀")

        #------------------------------------------------crawl last night's news-----------------------------------------------------
        print("writing all urls into the DB")
        dateUrl = DateUrl(
        )  # edited 2018-09-27  todo: split this apart; the next step only needs to pull the rows where urlState="False"
        dateUrl.pageUrlMain(now_date)  # fetch the day's urls and write them into the DB; the return value does not matter here

        #-------------------------------------------------open the pages------------------------------------------------------
        print("reading the pages")
        todayNewUrl = dbhelper.__query__(
            "select url from tengxun where urlState='False' and fromWhere='tengxun'"
        )
        print("fetched " + str(len(todayNewUrl)) + " rows")
        print("")

        # throttled on purpose: sleep 2 minutes every 200 urls
        count = 1
        delCount = 0
        for dic in todayNewUrl:
            url = dic['url']
            if count % 200 == 0:
                time.sleep(60 * 2)
                print("resting for 2 minutes")
            count += 1

            # pass the crawl date through so it is written along with the content
            title, Hcontent, Tcontent, Acontent = pcontent.getPageContentMain(
                url, now_date)  # converts to the xw. url automatically and downloads; updating the url itself was missed here
            time.sleep(1)

            if (title != "腾讯没找到标题" and title != None
                    and Hcontent != ""):  # the sentinel literal means "Tencent: no title found"; update the row when there is content

                # todo: also generate the word cloud locally and merge its path into Acontent as an img src.
                # build the img tag
                News_Id = url.replace("$", "").replace("/", "").replace(
                    ":", "_").replace(".", "_")

                imgTag = "<img src=" + Gen_WordCloud(
                    Newsid=News_Id,
                    text=Acontent) + " />"  # single quotes would break the sql statement
                print(imgTag)
                Acontent = imgTag + Acontent
                print("update payload:")
                print(title)
                print(Tcontent)
                print(url)
                print(Acontent)
                print("payload shown")

                resultState = dbhelper.updateContent(url, title, Hcontent,
                                                     Tcontent,
                                                     Acontent)  # the row whose update failed is the one that gets deleted
                if resultState == False:  # the update failed
                    print("update failed; deleting this news item (different url, duplicate title)")
                    print(url)
                    dbhelper.deleteUrl(url)  # delete the rows that failed extraction
                    print()
                else:
                    pass  # nothing to do on success
            else:
                delCount += 1
                print("page open/extract failed, probably a 404 on Tencent; deleting this url")
                dbhelper.deleteUrl(url)  # delete the row by url
        dbhelper.classifyDB()  # once done, classify into the django database

        comment = CommentCrawl()
        comment.getCommentMain()  # crawl the comments and classify them into the django database
        print("deleted " + str(delCount) + " in total")
        print("started with " + str(len(todayNewUrl)) + " rows")
        print("today's crawl is done, thanks for using it")
Example #25
0
            return False
    except:
        print("Check process ERROR!!!")
        return False


def readfile(tfile):
    with open(tfile, 'r') as f:
        lines = f.readlines()
        return lines[-50:]


# todo: include the exact error location (traceback) in every error message
if __name__ == "__main__":
    dbhelper = DB()
    tempNumber = len(
        dbhelper.__query__("select * from c_title"))  # baseline count, to see how much it grows
    email = EMail()
    timeSleep = 60 * 60 * 6  # sleep interval between status mails
    while (1):
        now_date = (date.today() + timedelta(days=-1)).strftime(
            "%Y-%m-%d")  # yesterday's date, used by the per-source counts below
        tengxunNumber = len(
            dbhelper.__query__(
                "select * from tengxun where newdate='%s' and fromwhere='%s'" %
                (now_date, "tengxun")))  # count what each source added yesterday
        wangyiNumber = len(
            dbhelper.__query__(
                "select * from tengxun where newdate='%s' and fromwhere='%s'" %
                (now_date, "wangyi")))
Example #26
0
class DateUrl:
    def __init__(self):
        self.dbhelper = DB()  # created by default

    def getDateUrlList(self, startDate, endDate):  # return the urls in this date range; they also get written into the DB along the way
        urlList = []
        timehelper = TimeHelper()
        datelist = []
        if (startDate != endDate):  # when the dates differ, expand the range
            datelist = timehelper.getTimeList(startDate, endDate)
        else:
            datelist.append(startDate)
        for oneDay in datelist:  # throttled here as well
            time.sleep(1.5)  # the api allows one request per 500 ms, so stay well above that
            onedatelist = []
            try:
                onedatelist = self.getOneDayNewUrl(oneDay)
            except Exception:
                time.sleep(30)
                onedatelist = self.getOneDayNewUrl(oneDay)
            urlList = urlList + onedatelist

            # self.saveListToMysql(onedatelist,oneDay,"tengxun")  # write each day's urls into the DB
        return urlList

    def getOneDayNewUrl(self, date):
        date = parse.quote_plus("" + date)
        oneDayUrlList = []
        print(str(date))
        # date = "2018-07-26"
        appid = "3639833dae924cb9efb6ba30e6c5a6fa"
        url = "https://api.shenjian.io/?appid=" + appid + "&date=" + date
        # print(url)
        request = urllib.request.Request(url,
                                         headers={
                                             "Accept-Encoding": "gzip",
                                         })

        response = urllib.request.urlopen(request)
        gzipFile = gzip.GzipFile(fileobj=response)
        # print(gzipFile.read().decode('UTF-8'))
        jsonResult = json.loads(str(gzipFile.read().decode('UTF-8')))
        if "data" in jsonResult:
            print(jsonResult['data'])
            print("number of news items: " + str(len(jsonResult['data'])))
            if (len(jsonResult['data']) == 4):  # a single article comes back as one flat dict with 4 keys
                oneDayUrlList.append(jsonResult['data']['url'])
                return oneDayUrlList
            else:
                for i in jsonResult['data']:
                    # print(i['url'])
                    oneDayUrlList.append(i['url'])
                return oneDayUrlList
        else:
            print("no 'data' key in the Tencent api response; retrying in 10 minutes")
            time.sleep(60 * 10)  # when the api stalls like this, just wait it out
            return self.getOneDayNewUrl(date)  # retry recursively
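    # Assumed response shapes handled above (sketch): normally a list of items,
    #   {"data": [{"url": "..."}, {"url": "..."}, ...]}
    # but a day with a single article comes back as one flat dict, which is what
    # the len(jsonResult['data']) == 4 check detects:
    #   {"data": {"url": "...", ...three more keys...}}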

# -----------------------------------------------------below starts the newer page-url extraction-----------------------------------

    def returnThemeCode(self, theme):  # map a channel name onto the magic code used in the roll url
        themeCodes = {
            'news': 1537874915062,
            'ent': 1537876288634,
            'sports': 1537877689177,
            'finance': 1537878365483,
            'tech': 1537879684280,
            'auto': 1537887032223,
            'house': 1537887128904,
        }
        return themeCodes.get(theme)

    def getThemeUrl(self, theme, today, pageNumber):
        rawUrl = "http://roll.news.qq.com/interface/cpcroll.php"
        rawReferer = '.qq.com/articleList/rolls/'  # prefixed below with 'http://' + theme
        my_headers = [
            'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
            'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
            'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'
        ]
        headers = {
            "User-Agent": random.choice(my_headers),
            'Referer': 'http://' + theme + rawReferer
        }  # defaults
        rawUrl = rawUrl + "?callback=rollback&mode=1&cata=&_=" + str(
            self.returnThemeCode(theme)) + "&site=" + theme + "&page=" + str(
                pageNumber) + "&date=" + today
        try:
            rawhtml = requests.get(
                rawUrl, headers=headers, allow_redirects=False,
                timeout=30)  # .text for text, .content for binary downloads
            rawhtml.encoding = chardet.detect(rawhtml.content)['encoding']
            # print(rawhtml.url)
            print("status code " + str(rawhtml.status_code))
            if rawhtml.status_code == 504:
                print(504)
                return []  # empty list, so the caller's len() check stops the paging loop
            print("page response:")
            # print(rawhtml.text)
            if rawhtml.text.find('rollback') == 0:
                jsonString = rawhtml.text.split("rollback")[1]  # strip the jsonp wrapper
            else:
                jsonString = rawhtml.text
            print(jsonString)
            dicData = eval(jsonString)
            print(type(jsonString))
            print(jsonString)
            # print(dicData['data']['article_info'])
            print(len(dicData['data']['article_info']))
            if dicData['data'] == "":
                print("past the last page; stop here")
                return []  # empty list for the same reason
            urllist = []
            for one in dicData['data']['article_info']:
                # print(one['url'])
                print(one['url'].replace("\\", "/"))  # still needs a check that this matches the older extraction format
                urllist.append(one['url'].replace("\\", "/"))
            return urllist
        except Exception as e:
            # print(e)
            return []
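    # The roll endpoint answers with jsonp shaped roughly like (assumption,
    # reconstructed from the parsing above):
    #   rollback({"data": {"article_info": [{"url": "http:\\/\\/news.qq.com\\/a\\/..."}, ...]}})
    # hence the leading "rollback" is stripped, the rest eval()ed, and the
    # escaped backslashes in each url replaced with "/".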

    def pageUrlMain(self, date):  # write the urls into the DB, grouped by channel
        # url    ="http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537850539512"

        urlNew = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=news&mode=1&cata=&date=2018-09-25&page=1&_=1537874915062"

        urlEnt = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=ent&mode=1&cata=&date=2018-09-25&page=1&_=1537876288634"  # referer = http://ent.qq.com/articleList/rolls/

        urlSport = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=sports&mode=1&cata=&date=2018-09-25&page=1&_=1537877689177"  # this roll seems to be loaded dynamically

        urlFinance = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=finance&mode=1&cata=&date=2018-09-25&page=1&_=1537878365483"

        urlTech = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=tech&mode=1&cata=&date=2018-09-25&page=2&_=1537879684280"

        urlAuto = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=auto&mode=1&cata=&date=2018-09-25&page=1&_=1537887032223"

        urlHouse = "http://roll.news.qq.com/interface/cpcroll.php?callback=rollback&site=house&mode=1&cata=&date=2018-09-25&page=1&_=1537887128904"

        resultUrlDic = {}  # used for the DB write
        tempList = []
        themeList = [
            'news', 'ent', 'tech', 'auto', 'house', 'finance', 'sports'
        ]  # seven themes in total (the site actually has more)

        for theme in themeList:
            print("current theme: " + theme)
            tempDList = []
            for i in range(1, 12):  # 10 pages is usually plenty
                print("page " + str(i))
                responseList = self.getThemeUrl(theme, date, i)
                if len(responseList) == 0:
                    print("last page was page " + str(i - 1))
                    break
                else:
                    tempList = tempList + responseList
                    tempDList += responseList
            resultUrlDic[theme] = tempDList
            print(resultUrlDic)
        tempList = set(tempList)
        count = 0
        print("unique urls in the list: " + str(len(tempList)))
        for key in resultUrlDic:
            count += len(resultUrlDic[key])
        print("total urls: " + str(count))

        print("urls extracted inside PageUrls:")
        pprint(resultUrlDic)
        print(len(resultUrlDic))

        print("and as a flat list:")
        print(tempList)

        self.dbhelper.saveDicToMysql(resultUrlDic, date,
                                     "tengxun")  # args: result dict, date, source tag
        return tempList  # already de-duplicated via the set above
Example #27
0
    def getNewsContent(self, url):  # open a news.ifenghxxx page and parse out the head paragraph, the tail paragraph, and the plain middle paragraphs
        title, Hcontent, Tcontent, Acontent = "", "", "", ""
        t = time.time()
        timeStamp = str(int(round(t * 1000)))  # millisecond timestamp
        time.sleep(1)
        print("current page: " + url)
        downloadTool = Download(r'/home/default/images')  # download path
        dbhelper = DB()

        title, Hcontent, Tcontent, Acontent = "", "", "", ""
        simpleP = []

        soup = makeBS().makesoup(url)
        if soup == None:
            return title, Hcontent, Tcontent, Acontent

        try:
            title = soup.find("head").title
            if dbhelper.ifExists(title):  # already stored, so nothing to write
                return title, Hcontent, Tcontent, Acontent  # no need to parse or download images
        except Exception as e:
            # usually means no title was found
            return title, Hcontent, Tcontent, Acontent  # treat it as unprocessed

        # print(title)
        if title != None:  # title found
            title = title.text.split("_")[0]
            # title=self.fixCssdot(title)
            # print(title)
        else:
            print("could not find a title; check the url " + url)
            # print(soup)
            return title, Hcontent, Tcontent, Acontent  # return empties as a fallback

        flag = False  # do not collect paragraphs until the start marker is seen
        pList = []
        simplePList = []

        for p in soup.find_all("p"):  # walk every <p> on the page
            try:
                if p['class'][0] == "p_time":
                    flag = True
                    continue  # skip this one and start collecting from the next
                if p['class'][0] == "detailPic":

                    # download the image, rewrite its src, and treat it as a paragraph  todo: factor into a module
                    imgSrc = p.img['src']
                    imgName = imgSrc.replace("https://inews.gtimg.com/",
                                             "").replace("/", "_")
                    now = time.strftime('%Y%m%d', time.localtime(time.time()))
                    now_date = now + "/"  # the downloaded file name itself must not contain the slash
                    imgName = now_date + self.stripImgUrl(
                        imgName)  # the date prefix has been added here

                    # download this image to the configured path on the server
                    downloadTool.downloadImg(imgSrc,
                                             imgName=imgName,
                                             referer=None,
                                             now_date=now)

                    imgPScr = "/images/" + imgName + ".jpg"  # the image src used in the generated html

                    # the rewritten img tag is treated like an ordinary paragraph from here on
                    pList.append('<img src=' + imgPScr + '/>')  # keep the format uniform
                    # print("<p><img src='"+p.img['src']+"'/></p>")
                if p.text == "用微信扫描二维码分享至好友和朋友圈":
                    flag = False  # this WeChat share blurb marks the end of the body
                    pass
            except Exception as e:
                pass
                if flag:
                    # print("collecting")
                    pList.append(p)
                else:
                    # print("stop collecting")
                    break
        # print(len(pList))  # the last sentence is the tail
        Plen = len(pList)
        if (len(pList) == 1):
            # with a single sentence, head and tail are unified
            print("this article has only one sentence " + url)
            return title, pList[0], pList[0], pList[0]  # in that case nothing is inserted

        PPlist = []
        continueFlag = False
        for pOne in pList:
            try:
                p = pOne.text
            except Exception:  # it is the rewritten image string, which has no .text
                p = pOne
            # print(pOne)
            if (p != ""):
                # print(p.strip("\n"))
                if p.strip(
                        "\n"
                ) == "用微信扫描二维码分享至好友和朋友圈" and continueFlag == False:  # some pages carry the share blurb, some do not
                    # print("first share blurb found")
                    # continue
                    continueFlag = True
                elif p.strip(
                        "\n"
                ) == "用微信扫描二维码分享至好友和朋友圈" and continueFlag == True:  # the second blurb
                    # print("this is where the body ends")
                    continueFlag = "break"
                if continueFlag == True:
                    if (p != "用微信扫描二维码分享至好友和朋友圈"):
                        p = "<p>" + p + "</p>"
                        # single quotes were meant to be replaced with double quotes here
                        # if p.find("'")!=-1:
                        #     print("found one")

                        PPlist.append(p)
                if continueFlag == 'break':
                    break

            else:
                pass  # images were already handled above, so empty text is fine here
                # if pOne.find("img")!=None:  # pre-processed images could be appended directly
                #     print(p)

        # print(PPlist)

        if len(PPlist) == 1 or len(PPlist) == 2:  # one or two paragraphs: the first one serves for everything
            Hcontent = PPlist[0]
            Tcontent = PPlist[0]
            Acontent = PPlist[0]
        if (len(PPlist)) > 2:
            Hcontent = PPlist[0]
            Tcontent = PPlist[-1]
            for i in PPlist:
                Acontent = Acontent + i
            # Acontent = PPlist

        # the plain middle paragraphs
        simplePList = PPlist[1:-1]

        # print("head and tail:")
        # print(Hcontent)
        # print(Tcontent)
        for simpleP in simplePList:  # write them straight into the DB
            dbhelper.insertSimpleP(simpleP)  # same as elsewhere
            # print(simpleP)
        return title, Hcontent, Tcontent, Acontent
Example #28
0
    def saveListToMysql(self, lists, date):  # write a url list into MySQL
        connect = DB()
        for i in lists:
            connect.insertTenxun(i, date, "wangyi")
Example #29
0
    def getPlContent(self, url):  # open a pl.ifenghxxx page
        title, Hcontent, Tcontent, Acontent = "", "", "", ""
        t = time.time()
        timeStamp = str(int(round(t * 1000)))  # millisecond timestamp
        time.sleep(1)
        print("current page: " + url)
        downloadTool = Download(r'/home/default/images')  # download path
        dbhelper = DB()

        title, Hcontent, Tcontent, Acontent = "", "", "", ""
        simpleP = []
        soup = None
        soup = makeBS().makesoup(url)
        if soup != None:
            title = soup.find("head")
        else:
            return title, Hcontent, Tcontent, Acontent

        # print(title)
        if title != None:  # title found
            title = title.text.split("_")[0]
            # title=self.fixCssdot(title)
            # print(title)
        else:
            print("could not find a title; check the url " + url)
            # print(soup)
            return title, Hcontent, Tcontent, Acontent  # return empties as a fallback

        flag = False  # do not collect paragraphs until the start marker is seen
        pList = []
        simplePList = []
        for p in soup.find_all("p"):
            # print(type(p))
            # print(p)
            try:
                # print(p['class'])
                if p['class'][0] == "f14":
                    # print(p)
                    pList.append(p)
                    flag = True
                if p['class'][0] == "IphoneNone":
                    # print(p['class'])
                    flag = False  # end marker: stop collecting
            except Exception as e:
                if flag:
                    # print("collecting")
                    pList.append(p)
                else:
                    # print("stop collecting")
                    break

        print(len(pList))  # the last sentence is the tail
        Plen = len(pList)

        if (len(pList) == 1):
            # with a single sentence, head and tail are unified
            print("this article has only one sentence " + url)
            return title, pList[0], pList[0], pList[0]  # in that case nothing is inserted

        for p in pList:
            # print(p)
            if p.text != "":
                pHtml = "<p>" + p.text + "</p>"
                if pList.index(p) == 0:  # the first element becomes the head
                    Hcontent = pHtml
                    Acontent = Acontent + pHtml
                    # print("find the header p")
                    # print(phtml)
                elif pList.index(p) == len(pList) - 1:  # likewise the last element becomes the tail
                    Acontent = Acontent + pHtml
                    Tcontent = pHtml
                else:  # paragraphs that are neither head nor tail go into the loose-paragraph list
                    Acontent = Acontent + pHtml
                    simplePList.append(pHtml)
            else:  # probably an image paragraph; handle it accordingly
                if p.find("img") != None:
                    print("found an image paragraph")
                    for img in p.find_all("img"):  # download the image and rewrite its path
                        imgSrc = img['src']
                        now = time.strftime('%Y%m%d',
                                            time.localtime(time.time()))
                        now_date = now + "/"  # the downloaded file name itself must not contain the slash

                        imgName = self.stripImgUrl(imgSrc)
                        print(imgName)

                        imgName = now_date + self.stripImgUrl(
                            imgName)  # the date prefix has been added here

                        # download this image to the configured path on the server
                        downloadTool.downloadImg(imgSrc,
                                                 imgName=imgName,
                                                 referer=None,
                                                 now_date=now)

                        imgPScr = "/images/" + imgName + ".jpg"  # the image src used in the generated html
                        HtmlImg = '<p><img src="' + imgPScr + '"/></p>'

                        if pList.index(p) == 0:  # the first element becomes the head
                            Hcontent = HtmlImg
                            Acontent = Acontent + HtmlImg

                        elif pList.index(
                                p) == len(pList) - 1:  # the last element becomes the tail
                            Acontent = Acontent + HtmlImg

                            Tcontent = HtmlImg
                        else:  # paragraphs that are neither head nor tail go into the loose-paragraph list
                            Acontent = Acontent + HtmlImg

                            simplePList.append(HtmlImg)

        # print("head and tail:")
        # print(Hcontent)
        # print(Tcontent)
        for simpleP in simplePList:  # write them straight into the DB
            dbhelper.insertSimpleP(simpleP)  # same as elsewhere
            # print(simpleP)
        return title, Hcontent, Tcontent, Acontent
Example #30
0
class CommentCrawl(object):
    def __init__(self):
        self.dbHelper = DB()

    def changTimeToDate(self, dateString):  # convert a unix timestamp into a "%Y-%m-%d" date string
        timeStamp = dateString
        timeArray = time.localtime(timeStamp)
        print(timeArray)
        otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
        # print(otherStyleTime)
        return otherStyleTime
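    # Usage sketch: changTimeToDate(1538150400) -> "2018-09-29" on a machine in
    # UTC+8; the exact date depends on the local timezone since time.localtime is used.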


    def getNewsIdAndUrl(self):   # pull each category's news ids and urls
        # dbHelper = DB()
        themeWord = ['car','technology','home','entertainment','house','finance','sports']  # news categories
        resultDic = {}
        sqlHead = "select News_id,url from newssentimentanalysis_"
        sqlTail = "news"
        # query each category's table
        for theme in themeWord:
            print(sqlHead+theme+sqlTail)
            resultDic[theme] = self.dbHelper.__query__(sqlHead+theme+sqlTail)  # collect the rows
        return resultDic  # shape: {'car': [{'News_id': xx, 'url': xx}, ...], 'home': [...], ...}

    def getAwriteCommentJson(self, id, url):                                        # fetch the comments for one article and parse the response dict
        time.sleep(1)
        cooker = makeBS()
        commentRawUrl = "http://coral.qq.com/article/"
        cmt_id = cooker.getCmt_id(url)  # comment id, whitespace stripped below
        if cmt_id == None:
            return
        if cmt_id.find("'") != -1:
            cmt_id = cmt_id.replace("'", "")
        else:
            cmt_id = cmt_id.strip()

        # print(cmt_id.strip())
        # cmt_id is what the comment url is assembled from
        try:
            allUrl = commentRawUrl + str(cmt_id) + "/comment/#"
            print(allUrl)
            responseDic = cooker.makeBSjson(allUrl)
            # if
            # print()
            print(responseDic)
            commentList = responseDic['data']['commentid']
            print(commentList)
            from pprint import pprint
            for comment in commentList:
                pprint(type(comment['id']))
                print(comment['id'])
                comment['content'] = emoji.demojize(comment['content'])      # filter out emoji
                comment['userinfo']['nick'] = emoji.demojize(comment['userinfo']['nick'])
                comment['time'] = self.changTimeToDate(comment['time'])             # timestamp -> date string
                print("news id " + str(id))
                print("news url " + url)


                self.dbHelper.classifyDBComment(url=url, id=id, comment=comment)   # insert into the database


                print("")
                # ----------------------- the insert sql is assembled and executed inside DBcontrol -----------------------
        except Exception as e:
            print("failed to extract this comment; skipping it")
            print(e)
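    # Assumed shape of the coral.qq.com response consumed above (sketch,
    # reconstructed from the fields accessed):
    #   {"data": {"commentid": [{"id": ..., "content": "...", "time": 1538150400,
    #                            "userinfo": {"nick": "..."}}, ...]}}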


    def getCommentMain(self):
        resultDic = self.getNewsIdAndUrl()
        print(resultDic)
        from pprint import pprint
        resultList = []
        count = 0
        for theme in resultDic:
            print("current theme:", theme)
            for oneNews in resultDic[theme]:
                count += 1  # count up, and sleep once a fixed number is reached
                if count % 100 == 0:  # every 100 articles
                    time.sleep(60*2)  # rest two minutes
                print(oneNews)  # already extracted
                self.getAwriteCommentJson(id=oneNews['News_id'], url=oneNews['url'])   # insert one by one; no return value needed
                # resultList.append(oneNews)
        print("finish comments crawl!")