Example #1
 def __init__(self):
     self._logger = Logger(__file__)
     #profile = FirefoxProfile()
     #profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
     #self._browser = webdriver.Firefox(profile)
     self._browser = webdriver.Firefox()
     self.baidu = Baidu(self._browser)
     self.map = BaiduMap()
     self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"
     self.table = Tables()
Example #2
    def __init__(self):
        self.interrupted = False

        # capture SIGINT signal, e.g., Ctrl+C
        signal.signal(signal.SIGINT, self.signal_handler)
        print('Listening... Press Ctrl+C to exit')

        self.baidu = Baidu()
        self.tuling = TuLing()
        self.netease = NetEase()
Example #3
class ChatBot:

    def __init__(self):
        self.interrupted = False

        # capture SIGINT signal, e.g., Ctrl+C
        signal.signal(signal.SIGINT, self.signal_handler)
        print('Listening... Press Ctrl+C to exit')

        self.baidu = Baidu()
        self.tuling = TuLing()
        self.netease = NetEase()

    def signal_handler(self, signal, frame):
        self.interrupted = True

    def interrupt_callback(self):
        return self.interrupted

    def chat(self):
        # ding
        util.audio_play('snowboy/resources/ding.wav')

        # record the audio
        util.audio_record('audio/audio.wav')
        # speech recognition
        response = self.baidu.asr('audio/audio.wav')

        # dong
        util.audio_play('snowboy/resources/dong.wav')

        if response['err_no'] == 0:
            question = response['result'][0]
            print('Q: ' + question)

            # a fairly crude implementation...
            if question.find('播放') == -1:
                # intelligent Q&A
                answer = self.tuling.tuling(question)
                print('A: ' + answer)

                # speech synthesis
                self.baidu.synthesis(answer, 'audio/audio.mp3')
                # play the audio
                util.audio_play('audio/audio.mp3')
            else:
                # download the song ('播放' means "play", so the rest is the title)
                song_name = question[2:]
                self.netease.download_song(song_name, 'audio/song.mp3')
                # play the audio
                util.audio_play('audio/song.mp3')
        else:
            print('%d: %s' % (response['err_no'], response['err_msg']))
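
Example #3 defines the SIGINT flag and the chat() callback, but nothing in it starts listening; in the snowboy demos that wiring is done by a hotword detector. A minimal sketch under that assumption (the model path is a placeholder):

import snowboydecoder

bot = ChatBot()
# hypothetical hotword model; 0.5 is the demo's default sensitivity
detector = snowboydecoder.HotwordDetector('snowboy/resources/snowboy.umdl', sensitivity=0.5)
# chat() fires once per wake word; interrupt_check lets Ctrl+C end the loop
detector.start(detected_callback=bot.chat,
               interrupt_check=bot.interrupt_callback,
               sleep_time=0.03)
detector.terminate()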
Example #4
 def start(self):
     self.LWweb.clear()
     self.page = 1
     s = self.CBBox.currentText()
     if s == "baidu":
         self.ClassSearch = Baidu()
     elif s == "360":
         self.ClassSearch = Haosou()
     self.search()
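
The elif chain above works for two engines but grows with every new one; a lookup table is a common alternative. A sketch of a drop-in replacement for the method, shown as it would appear inside the same dialog class (the ENGINES table is hypothetical):

ENGINES = {"baidu": Baidu, "360": Haosou}

def start(self):
    self.LWweb.clear()
    self.page = 1
    # map the combobox text to an engine class instead of branching
    engine = ENGINES.get(self.CBBox.currentText())
    if engine is not None:
        self.ClassSearch = engine()
        self.search()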
Example #5
 def __init__(self):
     self._logger = Logger(__file__)
     # the entry point of grabbing
     self.base = "http://scenic.cthy.com"
     self.provinces = []
     #self._browser = webdriver.PhantomJS()
     self._browser = webdriver.Firefox()
     self.tabopera = Tables()
     self.record = open("record.txt", "a+")
     self.fdate = open("date.txt", "a+")
     self.fprice = open("price.txt", "a+")
     self.sprovince = 0
     self.spage = 1
     self.snum = 0
     self.picturenum = 10
     self.baidu = Baidu(self._browser)
     self.map = BaiduMap()
     self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"
Example #6
    def baidu_process(self):
        file_path = self.file_path
        subtitle = self.sub
        per = self.ui.per.checkedId()
        spd = self.ui.spd_bd.value()
        vol = self.ui.vol_bd.value()
        pit = self.ui.pit_bd.value()
        options = {'per': per, 'spd': spd, 'vol': vol, 'pit': pit}

        with open('setting.json', 'r') as ff:
            setting = json.load(ff)
            bd_setting = setting.get('baidu', {})

        baidu = Baidu(bd_setting, options)
        for index, i in enumerate(subtitle):
            file_name = f'{file_path}/audio/{index + 1}.mp3'
            self.statusBar().showMessage(f'正在下载第{index + 1}句:{i}')
            baidu.process(i, file_name)

        self.statusBar().showMessage('下载完成!')
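
Baidu here is the project's own wrapper; the official baidu-aip SDK is a likely backend. A minimal sketch of what process() might do on top of it (the credential keys expected in bd_setting are assumptions):

from aip import AipSpeech

class Baidu:
    def __init__(self, setting, options):
        # assumed keys in setting.json's "baidu" section
        self.client = AipSpeech(setting['app_id'], setting['api_key'], setting['secret_key'])
        self.options = options  # per/spd/vol/pit as read from the UI

    def process(self, text, file_name):
        # the API returns MP3 bytes on success and an error dict on failure
        result = self.client.synthesis(text, 'zh', 1, self.options)
        if not isinstance(result, dict):
            with open(file_name, 'wb') as f:
                f.write(result)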
Example #7
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'scwx.newssc.org', 'news', '四川外宣网')
    obj.main()
Example #8
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'www.cdwb.com.cn', 'news', '成都晚报')
    obj.main()
Example #9
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'www.sc.xinhuanet.com', 'news', '新华网四川频道')
    obj.main()
Example #10
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'xnsb.newssc.org', 'news', '西南商报')
    obj.main()
Example #11
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'scol.com.cn', 'news', '四川日报')
    obj.main()
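
Examples #7 to #11 (and #14, #16, #19 below) differ only in the site parameters, so they could be folded into one table-driven entry point. A sketch, with the SITES table and run_all being hypothetical:

SITES = [
    ('scwx.newssc.org', '四川外宣网'),
    ('www.cdwb.com.cn', '成都晚报'),
    ('scol.com.cn', '四川日报'),
]

def run_all(id):
    for domain, name in SITES:
        # for a blog, change the type 'news' to 'blog'
        Baidu(id, domain, 'news', name).main()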
Example #12
from ask import Ask
from baidu import Baidu
from bing import Bing
from dailymotion import Dailymotion
from duckduckgo import Duckduckgo
from exalead import Exalead
from google import Google
from mojeek import Mojeek
from parsijoo import Parsijoo
from quora import Quora
from yahoo import Yahoo
from yandex import Yandex
from youtube import Youtube

scrapers = {
    'ask': Ask(),
    'baidu': Baidu(),
    'bing': Bing(),
    'dailymotion': Dailymotion(),
    'duckduckgo': Duckduckgo(),
    'exalead': Exalead(),
    'google': Google(),
    'mojeek': Mojeek(),
    'parsijoo': Parsijoo(),
    'quora': Quora(),
    'yahoo': Yahoo(),
    'yandex': Yandex(),
    'youtube': Youtube()
}
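
The registry is keyed by engine name, so request handling reduces to a lookup plus delegation. A sketch assuming each scraper exposes a common search method (the method name is an assumption):

def feed(engine, query):
    # unknown engine keys return an empty result instead of raising
    scraper = scrapers.get(engine)
    if scraper is None:
        return []
    return scraper.search(query)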


def small_test():
Example #13
class Grab:
    def __init__(self):
        self._logger = Logger(__file__)
        #profile = FirefoxProfile()
        #profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
        #self._browser = webdriver.Firefox(profile)
        self._browser = webdriver.Firefox()
        self.baidu = Baidu(self._browser)
        self.map = BaiduMap()
        self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"
        self.table = Tables()

    def __del__(self):
        self._browser.quit()
        # record is only opened in some variants of __init__; guard before closing
        if hasattr(self, "record"):
            self.record.close()

    def loadData(self):
        with open('allprovinces.json') as json_file:
            data = json.load(json_file)
        return data

    def getData(self):
        data = self.loadData()
        if not data:
            return None
        for pro in data["provincesList"]:
            cities = pro["Citys"]
            proname = pro["Name"]
            if cities:
                for city in cities:
                    name = city["Name"]
                    self._logger.info("current city: "+name)
                    cityInfo = self.baidu.baike(name)
                    if not cityInfo:
                        continue
                    cityBasic = cityInfo["basic"]
                    summary = cityInfo["summary"]
                    cityImage = self.baidu.niceImage(name + '壁纸', width=1300, height=750)
                    # default coordinates, overwritten when geocoding succeeds
                    lng = "0.0"
                    lat = "0.0"
                    cityGeo = self.map.getGeoAddress(name, self.ak)
                    if cityGeo and "location" in cityGeo:
                        location = cityGeo["location"]
                        lng = location["lng"]
                        lat = location["lat"]
                    cityID = city["Id"]
                    zoneNum = zipCode = area = climate = ptype = acreage = ""
                    if u"电话区号" in cityBasic:
                        zoneNum = cityBasic[u"电话区号"]
                    if u"邮政区码" in cityBasic:
                        zipCode = cityBasic[u"邮政区码"]
                    if u"地理位置" in cityBasic:
                        area = cityBasic[u"地理位置"]
                    if u"气候条件" in cityBasic:
                        climate = cityBasic[u"气候条件"]
                    if u"行政区类别" in cityBasic:
                        ptype = cityBasic[u"行政区类别"]
                    if u"面    积" in cityBasic:
                        acreage = cityBasic[u"面    积"]
                    cityParams = (cityID,name,proname,ptype,area,zoneNum,acreage,climate,zipCode,lng,lat,summary)
                    self.table.insertTable("city",cityParams)
                    if cityImage:
                        for pic in cityImage:
                            self.table.insertTable("cityImages",(cityID,str(uuid.uuid1()),pic,"",""))
Example #14
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'chinawestnews.net', 'news', '中国西部网')
    obj.main()
Example #15
import unittest
import HTMLTestRunner
from baidu import Baidu
if __name__ == "__main__":
    # unittest.main()
    suite = unittest.TestSuite()
    # suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Baidu))
    suite.addTest(Baidu('test_baidu'))
    # define the name of the generated test report
    filename1 = "result.html"
    fp = open(filename1, 'wb')
    # define the report's stream, title, description, etc.
    runner = HTMLTestRunner.HTMLTestRunner(stream=fp,
                                           title=u'自动化测试报告',
                                           description=u'自动化测试报告')
    runner.run(suite)
    fp.close()
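
Baidu('test_baidu') instantiates a single named test, which only works if baidu.py defines a TestCase with that method. A minimal sketch of such a class, assuming Selenium (the assertion is illustrative):

import unittest
from selenium import webdriver

class Baidu(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_baidu(self):
        # open the home page and check the title
        self.driver.get('https://www.baidu.com')
        self.assertIn(u'百度', self.driver.title)

    def tearDown(self):
        self.driver.quit()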
Example #16
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'sc.sina.com.cn', 'news', '新浪四川')
    obj.main()
Example #17
from aiohttp import web
from ting89 import Ting89
# from woai import Woai
from baidu import Baidu
from ysts8 import Ysts8

GTing89 = Ting89()
# Gwa = Woai()
GBaidu = Baidu()
GYsts8 = Ysts8()


async def handle(request):
    return web.Response(text='only post support')


async def searchHandler(request):
    inData = await request.json()
    inName = inData['name']

    # print('searching',inName)
    data1 = GTing89.search(inName)
    data2 = GYsts8.search(inName)
    data3 = GBaidu.search(inName)
    outData = [{
        'mod': 'ting89',
        'data': data1
    }, {
        'mod': 'ysts8',
        'data': data2
    }, {
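
The snippet is cut off mid-response, but the declared handlers still have to be registered with an application; a minimal sketch of the usual aiohttp wiring (the route paths and port are assumptions):

app = web.Application()
app.router.add_get('/', handle)
app.router.add_post('/search', searchHandler)
web.run_app(app, port=8080)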
Example #18
import json
import sys

from ask import Ask
from baidu import Baidu
from bing import Bing
from duckduckgo import Duckduckgo
from exalead import Exalead
from google import Google
from parsijoo import Parsijoo
from quora import Quora
from yahoo import Yahoo
from yandex import Yandex
from youtube import Youtube

scrapers = {
    'g': Google(),
    'b': Bing(),
    'y': Yahoo(),
    'd': Duckduckgo(),
    'a': Ask(),
    'yd': Yandex(),
    'u': Baidu(),
    'e': Exalead(),
    'q': Quora(),
    't': Youtube(),
    'p': Parsijoo()
}


def read_in():
    lines = sys.stdin.readlines()
    return json.loads(lines[0])


def small_test():
    assert isinstance(scrapers['g'].results_search('fossasia'), list)
Example #19
def main(id):
    # for a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'www.scwmw.gov.cn', 'news', '四川文明网')
    obj.main()
Example #20
c2e_opt.model = opt.c2e_model
c2e = C2E(c2e_opt)

e2c_opt = opt
e2c_opt.bpe_codes = opt.e2c_codes
e2c_opt.model = opt.e2c_model
e2c = E2C(e2c_opt)

#### youdao info
youdao_app_id = "5dcd671707dab45f"
youdao_key = "Z4h3RJgLdmsKro9148kxm13zzHh9YjkI"
youdao_app = Youdao(youdao_app_id, youdao_key)

baidu_app_id = "20160825000027412"
baidu_key = "nqZwgqSR74topEKu8MGL"
baidu_app = Baidu(baidu_app_id, baidu_key)

google_app = Google()
#######

###


def cmcmTran(doc, direction):
    if direction == 'e2c':
        rstr = e2c.translate(doc.strip())
    else:
        rstr = c2e.translate(doc.strip())
    return rstr.strip()
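
A quick usage check of the dispatcher above, with illustrative strings:

# 'e2c' selects the local English-to-Chinese model; anything else falls through to C2E
print(cmcmTran('Hello world', 'e2c'))
print(cmcmTran('你好,世界', 'c2e'))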

Example #21
class Grab:
    def __init__(self):
        self._logger = Logger(__file__)
        # the entry point of grabbing
        self.base = "http://scenic.cthy.com"
        self.provinces = []
        #self._browser = webdriver.PhantomJS()
        self._browser = webdriver.Firefox()
        self.tabopera = Tables()
        self.record = open("record.txt", "a+")
        self.fdate = open("date.txt", "a+")
        self.fprice = open("price.txt", "a+")
        self.sprovince = 0
        self.spage = 1
        self.snum = 0
        self.picturenum = 10
        self.baidu = Baidu(self._browser)
        self.map = BaiduMap()
        self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"

    def __del__(self):
        self._browser.quit()
        self.record.close()

    def getProvinces(self):
        '''Get the identifying numbers of the provinces.
        # Process:
            1) Get the source code of the entry point (http://scenic.cthy.com) with the browser
            2) Find the <map> tag which contains the information of the provinces
            3) Extract the two-digit number of every province from its <area> link
        # Return:
            A list of the two-digit province numbers extracted from the links,
            or None if the province map cannot be found.
        '''

        self._browser.get(self.base)
        entry = BeautifulSoup(self._browser.page_source)
        map = entry.find("map") 
        if map:
            # the pattern to extract number from link
            pattern = re.compile(r".*(\d\d)/")
            self._logger.info("got the the tag containing the information of provinces")
            for item in map.find_all("area"):
                number = re.findall(pattern,item.attrs["href"])
                if number:
                    self.provinces.append(number[0])
                else:
                    continue
        else:
            self._logger.info("sorry,did not get the province map data")
            return None
        return self.provinces

    def searchAll(self):
        for i in range(self.sprovince, len(self.provinces)):
            self.searchScenic(i)
        
    def searchScenic(self, num):
        """Extract the scenic information of a specified province.
        # Parameters:
        num: the number of the province whose scenic information you want to grab
        # Return:
        """
        prefix = "/scenicSearch/"
        suffix = "-0-0-0-0-1.html"
        self._browser.get(self.base+prefix+str(self.provinces[num])+suffix)
        first = BeautifulSoup(self._browser.page_source)
        """ The content of tags:
        # the total records
        [<span class="f14 point">135</span>,
        # how many pages
        <span class="f14 point">14</span>,
        # the number of records of one page
        <span class="f14 point">10</span>]
        """
        palist = first.find(id="PagerList")
        if palist:
            tags = palist.select("li > span")
        else:
            return False
        if tags and len(tags) >= 2:
            pageCount = int(tags[1].string)
            self._logger.info("total: "+tags[0].string+" records. "+"total "+tags[1].string+" pages")
        else:
            return False

        for i in range(self.spage, pageCount + 1):
            self.searchSeniceSpiPage(num, str(i))
        # important: this must be reset to 1
        self.spage = 1
        return True

    def searchSeniceSpiPage(self,num,pagenum):
        """Search scenics information from a specified page of a specified province
        # Parameters:
        num:  the number of a province which you want to grab scenic information
        page: where now you want to extract scenic information from
        # Return:
        """
        addr = "/scenicSearch/"+str(self.provinces[num])+"-0-0-0-0-"+str(pagenum)+".html"
        # record the current searching page
        self._browser.get(self.base+addr)
        page = BeautifulSoup(self._browser.page_source)
        sightTags = page.select("div.sightlist > div.sightshow > div.sightdetail > h4 > a") 
        link = ""
        if sightTags:
            for i in range(self.snum,len(sightTags)):
                # recording the number of province,page,item for recovery
                self.record.write(str(num)+" "+str(pagenum)+" "+str(i)+"\n")
                self._logger.info("current position: "+str(num)+" "+str(pagenum)+" "+str(i))
                self._logger.info("got the link of "+sightTags[i].string)
                link = sightTags[i].attrs["href"]
                self.extractScenicInfor(link)
        else:
            self._logger.error("searchSeniceSpiPage: can not get the list of scenics")
            return False
        # important: this must be reset to 0
        self.snum = 0
        return True

    def extractScenicInfor(self,link):
        """Extract a scenic information with the given scenic address
        # Parameters:
        link:  the address where you can get detailed information of scenic
        # Return:
        """
        scenic = self.extractScenicAbout(link)
        if not scenic:
            return False
        scenic = self.remedy(scenic)
        scenic = self.remedyMap(scenic)
        self.tabopera.insertData(scenic)
        return True

    def remedy(self,scenic):
        """if the return of function  extractScenicAbout if not enough,we need to access baidu for more information
        """
        openpat = u"开放时间"
        suggpat = u"时长"
        areapat = u"面积"
        pricepat = u"门票"

        # this is for getting longitude and latitude
        scenic.mapname = scenic.name

        # remedy pictures
        picnum = len(scenic.images)
        if picnum < 10:
            self._logger.info("There are " + str(picnum) + " pictures. Getting the rest from Baidu Image")
            imgs = self.baidu.image(scenic.name, self.picturenum - len(scenic.images))
            if imgs:
                scenic.images.extend(imgs)

        if not scenic.description:
            self._logger.info("Got details from baike")
            baike = self.baidu.baike(scenic.name)
            if not baike:
                self._logger.error("Remedy: can not got information from baidu baike")
                return scenic
            if "detail" in baike.keys():
                scenic.description = baike["detail"]
        else:
            baike = self.baidu.baike(scenic.name,False)
            if not baike:
                self._logger.error("Remedy: can not got information from baidu baike")
                return scenic

        # use the name in baike for baidu searching
        if "name" in baike.keys():
            scenic.mapname = baike["name"]

        if "basic" in baike.keys():
            basic = baike["basic"]
            for item in basic.keys():
                if re.findall(openpat,item):
                    times = re.findall(r"(\d+[:|;]\d+).*(\d+[:|;]\d+)",basic[item])
                    if times:
                        scenic.opentime = times[0][0]
                        scenic.closetime = times[0][1]
                    else:
                        scenic.opentime = "00:00"
                        scenic.closetime = "23:00"
                if re.findall(suggpat,item):
                    scenic.suggest = basic[item]
                if re.findall(pricepat,item):
                    scenic.price = basic[item]
                if re.findall(areapat,item):
                    scenic.area = basic[item]
        if not scenic.opentime:
            scenic.opentime = "00:00"
        if not scenic.closetime:
            scenic.closetime = "23:00"
        if not scenic.price:
            scenic.price = "0"
        if not scenic.area:
            scenic.area = "未知"
        if not scenic.symbol:
            if scenic.images:
                scenic.symbol = scenic.images[0]
        return scenic

    def remedyMap(self, scenic):
        # map-related fields: fill in coordinates and geocoding confidence
        mapret = self.map.getGeoAddress(scenic.mapname,self.ak)
        if mapret:
            if "location" in mapret.keys():
                scenic.latitude = "%.13f" % mapret["location"]["lat"]
                scenic.longitude = "%.13f" % mapret["location"]["lng"]
            if "precise" in mapret.keys():
                scenic.precise = str(mapret["precise"])
            if "confidence" in mapret.keys():
                scenic.confidence = str(mapret["confidence"])
        return scenic
        
    def extractScenicAbout(self,link):
        """Extract the information of introduction,geographic postion,type,quality,class 
        # Parameters:
        link:  the address where you can get detailed information of scenic

        # Return:
        the return value is a dict which has fowllowing attrs:
        province: 
        city:
        types:
        level:
        fits:
        description:
        images:
        """
        scenic = Scenic()
        # get the symbol picture and the name of the scenic spot from the index page
        self._browser.get(link)
        first = BeautifulSoup(self._browser.page_source)
        symbol = first.select("div.sightfocuspic > img")
        if symbol:
            scenic.symbol = self.base + symbol[0].attrs["src"] if symbol[0].attrs["src"] else ""
        scename = first.select("div.sightprofile > h4")
        if scename:
            scenic.name = scename[0].string

        else:
            # if we cannot get the scenic name, the page is wrong
            self._logger.error("Cannot get the scenic name. The page may be wrong, please check it")
            return None
        # get detailed information about scenic at about page
        addr = link+"about.html"
        self._browser.get(addr)
        about = BeautifulSoup(self._browser.page_source)
        relative = about.select("div.main > div.wrap > div.pright > div.pfood > ul#RightControl11_ScenicBaseInfo > li")
        if len(relative) == 5:
            # get province and city information
            pos = relative[0].select("a")
            # it is only valid when we extract exactly two items (province and city)
            if len(pos) == 2:
                if pos[0].string:
                    scenic.province = pos[0].string
                if pos[1].string:
                    scenic.city = pos[1].string
                self._logger.info("current position: province: "+scenic.province+" city: "+scenic.city)
            else:
                return None
            # get the type of scenic
            for item in relative[1].select("a"):
                if item.string:
                    scenic.types.append(item.string)
            # get the quality of scenic
            qua = relative[2].find("a")
            if qua:
                scenic.quality = qua.string
            # get the scenic level
            lev = relative[3].find("a")
            if lev:
                scenic.level = lev.string
            # get the fit time of the scenic
            for item in relative[4].select("a"):
                if item.string:
                    scenic.fits.append(item.string)
        else:
            self._logger.error("there is not ralative information"+str(len(relative)))
            return None

        # get the description of the scenic
        desc = about.find(id="AboutInfo")
        if desc:
            for s in desc.stripped_strings:
                scenic.description = scenic.description + s + "\n"
            for item in desc.find_all("p"):
                # if a p tag contains an image address, it always has the style or align attr
                attrs = item.attrs
                if "style" in attrs.keys() or "align" in attrs.keys():
                    for img in item.find_all("img"):
                        if not img.attrs["src"]:
                            continue
                        scenic.images.append(self.base+img.attrs["src"])
                else:
                    for s in item.stripped_strings:
                        scenic.description = scenic.description + s + "\n" 
        else:
            self._logger.info("there is no description information and scenic pictures")
        scenic.website = link
        return scenic

    def extractScenicAttractions(self,link):
        """extract information of attractions of a specified scenic
        # Parameters:
        link:  the address where you can get attractions of scenic
        # Return:
        The return value is a list of dicts; each item contains the following attrs:
        
        """
        attractions = []
        addr = link+"about.html"
        self._browser.get(addr)
        page = BeautifulSoup(self._browser.page_source)
        lists = page.select("")

    def startGrab(self):
        # "a+" leaves the read position at the end of the file, so rewind first
        self.record.seek(0)
        content = self.record.readlines()
        # if there is a previous record, resume from the last saved position
        if len(content) != 0:
            line = content[len(content)-1]
            strs = line.split(" ")
            self.sprovince = int(strs[0])
            self.spage = int(strs[1])
            self.snum = int(strs[2])
        self.getProvinces()
        self.searchAll()
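
A minimal entry point for the class above, assuming the script is run directly:

if __name__ == '__main__':
    grab = Grab()
    # resumes from record.txt when a previous run left a position behind
    grab.startGrab()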