Example #1
0
def getTopic(loop, url):
    """Scrape one tieba topic page and store it with the OCR text of its image.

    Generator-based coroutine, meant to be driven with ``yield from``.
    The page at *url* is fetched with ``wget`` and parsed. Every element
    matched by ``checkTopic`` contributes its stripped text to the topic
    list, and the ``src`` of the first ``BDE_Image`` element that has one is
    used as the image. A ``WeaponChangeTopic`` row is saved only when an
    image was found, more than 6 text entries were collected, no row with
    the same topicId exists yet, and the ``getImgWords`` response contains a
    non-empty ``retData`` list.

    Args:
        loop: Event loop handed through to ``orm.create_pool``.
        url: Topic URL. The digits captured by ``^http.+/(\\d+)`` become the
            topicId; 0 is used when the URL does not match.

    Returns:
        None. Pages that cannot be used are skipped instead of raising.
    """
    topicHtml = yield from wget(url)
    # Responses of 100 characters or fewer (or no response at all) are ignored.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+/(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    topic = []
    imgUrl = None
    for content in soup.find_all(checkTopic):
        # Only the first image with a usable src is kept.
        if not imgUrl:
            img = content.find(class_="BDE_Image")
            if img is not None:
                imgUrl = img.get("src")
        topic.append(content.get_text().strip())

    if not imgUrl or len(topic) <= 6:
        return

    m = re.match(r'^http.+/(.+)', imgUrl)
    if m is None:
        # e.g. a relative or protocol-less src; no file name can be derived.
        logging.warning('unexpected image url %s in %s', imgUrl, url)
        return
    imgName = m.group(1)

    # A page without a <title> tag is stored with an empty title.
    titleTag = soup.title
    topicTitle = titleTag.string if titleTag is not None else ''
    logging.info('=============%s==============', topicTitle)
    logging.info(url)

    # NOTE(review): create_pool runs for every stored topic; confirm that
    # repeated calls are intended.
    yield from orm.create_pool(loop=loop,
                               host=dbHost,
                               user=dbUser,
                               password=dbPassword,
                               db=dbName)
    num = yield from WeaponChangeTopic.findNumber(
        'id', 'topicId=?', topicId)
    if num is not None:
        # Topic is already stored.
        return

    words = yield from getImgWords(imgUrl)
    # Keep the try body limited to payload parsing so that errors raised by
    # the save below are not reported as OCR JSON errors.
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if len(wordList) > 0 else None
    except (ValueError, KeyError, TypeError, IndexError):
        logging.error("baidu ocr json error: %s", words)
        return
    if weaponTitle is None:
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(
        comefrom="tieba",
        topicId=topicId,
        title=weaponTitle,
        details=json.dumps(wordList),
        topicTitle=topicTitle,
        topicList=json.dumps(topic),
        img=imgName)
    yield from weapon.save()
Example #2
0
def getTopic(loop,url):
    """Scrape a tieba topic page and persist it together with image OCR text.

    Generator-based coroutine (use with ``yield from``). The HTML returned
    by ``wget(url)`` is parsed; the stripped text of every element matched
    by ``checkTopic`` is collected, and the ``src`` of the first
    ``BDE_Image`` element that has one is taken as the topic image. The
    topic is written as a ``WeaponChangeTopic`` only if an image exists,
    more than 6 texts were collected, ``findNumber`` reports no existing row
    for the topicId, and the ``getImgWords`` response has a non-empty
    ``retData`` list.

    Args:
        loop: Event loop forwarded to ``orm.create_pool``.
        url: Topic URL; the digits captured by ``^http.+/(\\d+)`` are the
            topicId (0 if the pattern does not match).

    Returns:
        None. Unusable pages are skipped rather than raising.
    """
    topicHtml = yield from wget(url)
    # Nothing to do for an absent response or one of 100 characters or fewer.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+/(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    topic = []
    imgUrl = None
    for content in soup.find_all(checkTopic):
        if not imgUrl:
            # Remember the first image that actually carries a src.
            img = content.find(class_="BDE_Image")
            if img is not None:
                imgUrl = img.get("src")
        topic.append(content.get_text().strip())

    if not imgUrl or len(topic) <= 6:
        return

    m = re.match(r'^http.+/(.+)', imgUrl)
    if m is None:
        # The src is not an absolute http URL, so no file name is available.
        logging.warning('unexpected image url %s in %s', imgUrl, url)
        return
    imgName = m.group(1)

    # Fall back to an empty title when the page has no <title> tag.
    titleTag = soup.title
    topicTitle = titleTag.string if titleTag is not None else ''
    logging.info('=============%s==============', topicTitle)
    logging.info(url)

    # NOTE(review): a pool is requested on every stored topic; verify that
    # orm.create_pool is meant to be called repeatedly.
    yield from orm.create_pool(loop=loop, host=dbHost, user=dbUser,
                               password=dbPassword, db=dbName)
    num = yield from WeaponChangeTopic.findNumber('id', 'topicId=?', topicId)
    if num is not None:
        # A row for this topicId already exists.
        return

    words = yield from getImgWords(imgUrl)
    # Only payload parsing is guarded; a failure while saving must not be
    # logged as an OCR JSON error.
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if len(wordList) > 0 else None
    except (ValueError, KeyError, TypeError, IndexError):
        logging.error("baidu ocr json error: %s", words)
        return
    if weaponTitle is None:
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(comefrom="tieba",
                               topicId=topicId,
                               title=weaponTitle,
                               details=json.dumps(wordList),
                               topicTitle=topicTitle,
                               topicList=json.dumps(topic),
                               img=imgName)
    yield from weapon.save()
Example #3
0
def getTopic(loop,url):
    """Scrape one bbs.d.163.com topic and store it with the OCR text of its image.

    Generator-based coroutine (use with ``yield from``). The topic page is
    fetched with a mobile User-Agent and the stripped text of every element
    matched by ``checkTopic`` is collected. The first link inside
    ``ul.img_one`` leads to an image page, whose ``img.postalbum_i`` element
    carries the real image address in its ``orig`` attribute. A
    ``WeaponChangeTopic`` row is saved only when that image exists and is
    not a png, more than 6 texts were collected, no row with the same
    topicId exists, and the ``getImgWords`` response has a non-empty
    ``retData`` list.

    Args:
        loop: Event loop forwarded to ``orm.create_pool``.
        url: Topic URL; the digits captured by ``^http.+tid=(\\d+)`` are
            the topicId (0 if the pattern does not match).

    Returns:
        None. Unusable pages are skipped rather than raising.
    """
    # Simulate a mobile browser; the same header is used for both requests.
    mobileHeaders = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'}
    topicHtml = yield from wget(url, mobileHeaders)
    # Nothing to do for an absent response or one of 100 characters or fewer.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+tid=(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    # List of posts.
    topic = [content.get_text().strip() for content in soup.find_all(checkTopic)]
    if len(topic) <= 6:
        # Too short to be stored, so skip the image page request entirely.
        return

    # Locate the link to the image page.
    imgList = soup.find("ul", class_="img_one")
    imgLink = imgList.find("a") if imgList is not None else None
    imgPageUrl = imgLink.get("href") if imgLink is not None else None
    if not imgPageUrl:
        return

    # Read the real image address from the image page.
    imgHtml = yield from wget('http://bbs.d.163.com/' + imgPageUrl, mobileHeaders)
    if not imgHtml or len(imgHtml) <= 100:
        return
    soup2 = BeautifulSoup(imgHtml, 'html.parser')
    imgTag = soup2.find("img", class_="postalbum_i")
    imgUrl = imgTag.get("orig") if imgTag is not None else None
    if not imgUrl:
        return

    m = re.match(r'^http.+/(.+)\.(.+)', imgUrl)
    if m is None:
        # No "<name>.<ext>" file name can be derived from this address.
        logging.warning('unexpected image url %s in %s', imgUrl, url)
        return
    imgType = m.group(2)
    imgName = m.group(1) + "." + imgType
    # Baidu OCR does not support png (compared case-insensitively).
    if imgType.lower() == "png":
        return

    # A page without an <h2> is stored with an empty title.
    heading = soup.find("h2")
    topicTitle = heading.get_text().replace(' ','').strip() if heading is not None else ''
    topicTitle = topicTitle.replace('只看楼主','')

    # NOTE(review): a pool is requested on every stored topic; verify that
    # orm.create_pool is meant to be called repeatedly.
    yield from orm.create_pool(loop=loop, host=dbHost, user=dbUser,
                               password=dbPassword, db=dbName)
    num = yield from WeaponChangeTopic.findNumber('id', 'topicId=?', topicId)
    if num is not None:
        # A row for this topicId already exists.
        return

    words = yield from getImgWords(imgUrl)
    # Only payload parsing is guarded; a failure while saving must not be
    # logged as an OCR JSON error.
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if len(wordList) > 0 else None
    except (ValueError, KeyError, TypeError, IndexError):
        logging.error("baidu ocr json error: %s", words)
        return
    if weaponTitle is None:
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(comefrom="163",
                               topicId=topicId,
                               title=weaponTitle,
                               details=json.dumps(wordList),
                               topicTitle=topicTitle,
                               topicList=json.dumps(topic),
                               img=imgName)
    yield from weapon.save()
Example #4
0
def getTopic(loop, url):
    """Scrape a bbs.d.163.com topic and persist it with its image's OCR text.

    Generator-based coroutine, meant to be driven with ``yield from``.
    Both HTTP requests are sent with a mobile User-Agent. The stripped text
    of every element matched by ``checkTopic`` forms the topic list. The
    first link inside ``ul.img_one`` points at an image page; the ``orig``
    attribute of that page's ``img.postalbum_i`` element is the real image
    address. A ``WeaponChangeTopic`` row is saved only when such an image
    exists and is not a png, more than 6 texts were collected, no row with
    the same topicId exists yet, and the ``getImgWords`` response contains a
    non-empty ``retData`` list.

    Args:
        loop: Event loop handed through to ``orm.create_pool``.
        url: Topic URL. The digits captured by ``^http.+tid=(\\d+)`` become
            the topicId; 0 is used when the URL does not match.

    Returns:
        None. Pages that cannot be used are skipped instead of raising.
    """
    # Simulate a mobile browser; shared by the topic and image page requests.
    mobileHeaders = {
        'User-Agent':
        'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'
    }
    topicHtml = yield from wget(url, mobileHeaders)
    # Responses of 100 characters or fewer (or no response at all) are ignored.
    if not topicHtml or len(topicHtml) <= 100:
        return

    soup = BeautifulSoup(topicHtml, 'html.parser')
    m = re.match(r'^http.+tid=(\d+)', url)
    topicId = m.group(1) if m is not None else 0

    # List of posts.
    topic = [
        content.get_text().strip() for content in soup.find_all(checkTopic)
    ]
    if len(topic) <= 6:
        # The topic would be discarded anyway; avoid the second request.
        return

    # Get the link to the image page.
    imgList = soup.find("ul", class_="img_one")
    imgLink = imgList.find("a") if imgList is not None else None
    imgPageUrl = imgLink.get("href") if imgLink is not None else None
    if not imgPageUrl:
        return

    # Get the real image address from the image page.
    imgHtml = yield from wget('http://bbs.d.163.com/' + imgPageUrl,
                              mobileHeaders)
    if not imgHtml or len(imgHtml) <= 100:
        return
    soup2 = BeautifulSoup(imgHtml, 'html.parser')
    imgTag = soup2.find("img", class_="postalbum_i")
    imgUrl = imgTag.get("orig") if imgTag is not None else None
    if not imgUrl:
        return

    m = re.match(r'^http.+/(.+)\.(.+)', imgUrl)
    if m is None:
        # The address has no "<name>.<ext>" tail to build a file name from.
        logging.warning('unexpected image url %s in %s', imgUrl, url)
        return
    imgType = m.group(2)
    imgName = m.group(1) + "." + imgType
    # Baidu OCR does not support png (compared case-insensitively).
    if imgType.lower() == "png":
        return

    # A page without an <h2> is stored with an empty title.
    heading = soup.find("h2")
    if heading is not None:
        topicTitle = heading.get_text().replace(' ', '').strip()
    else:
        topicTitle = ''
    topicTitle = topicTitle.replace('只看楼主', '')

    # NOTE(review): create_pool runs for every stored topic; confirm that
    # repeated calls are intended.
    yield from orm.create_pool(loop=loop,
                               host=dbHost,
                               user=dbUser,
                               password=dbPassword,
                               db=dbName)
    num = yield from WeaponChangeTopic.findNumber(
        'id', 'topicId=?', topicId)
    if num is not None:
        # Topic is already stored.
        return

    words = yield from getImgWords(imgUrl)
    # Keep the try body limited to payload parsing so that errors raised by
    # the save below are not reported as OCR JSON errors.
    try:
        wordList = json.loads(words)['retData']
        weaponTitle = wordList[0]["word"] if len(wordList) > 0 else None
    except (ValueError, KeyError, TypeError, IndexError):
        logging.error("baidu ocr json error: %s", words)
        return
    if weaponTitle is None:
        return

    logging.info(weaponTitle.strip())
    weapon = WeaponChangeTopic(
        comefrom="163",
        topicId=topicId,
        title=weaponTitle,
        details=json.dumps(wordList),
        topicTitle=topicTitle,
        topicList=json.dumps(topic),
        img=imgName)
    yield from weapon.save()