def __init__(self):
    self._logger = Logger(__file__)
    # profile = FirefoxProfile()
    # profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
    # self._browser = webdriver.Firefox(profile)
    self._browser = webdriver.Firefox()
    self.baidu = Baidu(self._browser)
    self.map = BaiduMap()
    self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"
    self.table = Tables()
def __init__(self):
    self.interrupted = False
    # capture SIGINT signal, e.g., Ctrl+C
    signal.signal(signal.SIGINT, self.signal_handler)
    print('Listening... Press Ctrl+C to exit')
    self.baidu = Baidu()
    self.tuling = TuLing()
    self.netease = NetEase()
class ChatBot:
    def __init__(self):
        self.interrupted = False
        # capture SIGINT signal, e.g., Ctrl+C
        signal.signal(signal.SIGINT, self.signal_handler)
        print('Listening... Press Ctrl+C to exit')
        self.baidu = Baidu()
        self.tuling = TuLing()
        self.netease = NetEase()

    def signal_handler(self, signal, frame):
        self.interrupted = True

    def interrupt_callback(self):
        return self.interrupted

    def chat(self):
        # ding
        util.audio_play('snowboy/resources/ding.wav')
        # record audio
        util.audio_record('audio/audio.wav')
        # speech recognition
        response = self.baidu.asr('audio/audio.wav')
        # dong
        util.audio_play('snowboy/resources/dong.wav')
        if response['err_no'] == 0:
            question = response['result'][0]
            print('Q: ' + question)
            # fairly rough implementation...
            if question.find('播放') == -1:
                # intelligent Q&A
                answer = self.tuling.tuling(question)
                print('A: ' + answer)
                # speech synthesis
                self.baidu.synthesis(answer, 'audio/audio.mp3')
                # play audio
                util.audio_play('audio/audio.mp3')
            else:
                # download the song
                song_name = question[2:]
                self.netease.download_song(song_name, 'audio/song.mp3')
                # play audio
                util.audio_play('audio/song.mp3')
        else:
            print('%d: %s' % (response['err_no'], response['err_msg']))
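# Hedged usage sketch: one way the ChatBot above could be driven by snowboy
# hotword detection. The import path and the model file
# 'snowboy/resources/snowboy.umdl' are assumptions, not taken from the snippet.
from snowboy import snowboydecoder

bot = ChatBot()
detector = snowboydecoder.HotwordDetector('snowboy/resources/snowboy.umdl',
                                          sensitivity=0.5)
# run bot.chat() each time the hotword fires; stop once Ctrl+C sets the flag
detector.start(detected_callback=bot.chat,
               interrupt_check=bot.interrupt_callback,
               sleep_time=0.03)
detector.terminate()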
def start(self):
    self.LWweb.clear()
    self.page = 1
    s = self.CBBox.currentText()
    if s == "baidu":
        self.ClassSearch = Baidu()
    elif s == "360":
        self.ClassSearch = Haosou()
    self.search()
def __init__(self):
    self._logger = Logger(__file__)
    # the entry point of grabbing
    self.base = "http://scenic.cthy.com"
    self.provinces = []
    # self._browser = webdriver.PhantomJS()
    self._browser = webdriver.Firefox()
    self.tabopera = Tables()
    self.record = open("record.txt", "a+")
    self.fdate = open("date.txt", "a+")
    self.fprice = open("price.txt", "a+")
    self.sprovince = 0
    self.spage = 1
    self.snum = 0
    self.picturenum = 10
    self.baidu = Baidu(self._browser)
    self.map = BaiduMap()
    self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"
def baidu_process(self):
    file_path = self.file_path
    subtitle = self.sub
    per = self.ui.per.checkedId()
    spd = self.ui.spd_bd.value()
    vol = self.ui.vol_bd.value()
    pit = self.ui.pit_bd.value()
    options = {'per': per, 'spd': spd, 'vol': vol, 'pit': pit}
    with open('setting.json', 'r') as ff:
        setting = json.load(ff)
    bd_setting = setting.get('baidu', {})
    baidu = Baidu(bd_setting, options)
    for index, i in enumerate(subtitle):
        file_name = f'{file_path}/audio/{index + 1}.mp3'
        self.statusBar().showMessage(f'正在下载第{index + 1}句:{i}')
        baidu.process(i, file_name)
    self.statusBar().showMessage(f'下载完成!')
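# Hedged sketch: writing a minimal setting.json for baidu_process() to read.
# The key names inside the 'baidu' block (app_id, api_key, secret_key) are
# assumptions based on typical Baidu TTS credentials, not taken from the snippet.
import json

setting = {'baidu': {'app_id': 'your-app-id',
                     'api_key': 'your-api-key',
                     'secret_key': 'your-secret-key'}}
with open('setting.json', 'w') as ff:
    json.dump(setting, ff, ensure_ascii=False, indent=2)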
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'scwx.newssc.org', 'news', '四川外宣网')
    obj.main()
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'www.cdwb.com.cn', 'news', '成都晚报')
    obj.main()
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'www.sc.xinhuanet.com', 'news', '新华网四川频道')
    obj.main()
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'xnsb.newssc.org', 'news', '西南商报')
    obj.main()
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'scol.com.cn', 'news', '四川日报')
    obj.main()
from ask import Ask
from baidu import Baidu
from bing import Bing
from dailymotion import Dailymotion
from duckduckgo import Duckduckgo
from exalead import Exalead
from google import Google
from mojeek import Mojeek
from parsijoo import Parsijoo
from quora import Quora
from yahoo import Yahoo
from yandex import Yandex
from youtube import Youtube

scrapers = {
    'ask': Ask(),
    'baidu': Baidu(),
    'bing': Bing(),
    'dailymotion': Dailymotion(),
    'duckduckgo': Duckduckgo(),
    'exalead': Exalead(),
    'google': Google(),
    'mojeek': Mojeek(),
    'parsijoo': Parsijoo(),
    'quora': Quora(),
    'yahoo': Yahoo(),
    'yandex': Yandex(),
    'youtube': Youtube()
}


def small_test():
    # scrapers is a dict, so index it by key; 'fossasia' is the sample query
    # used by the project's other small_test
    assert isinstance(scrapers['google'].results_search('fossasia'), list)
class Grab:
    def __init__(self):
        self._logger = Logger(__file__)
        # profile = FirefoxProfile()
        # profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
        # self._browser = webdriver.Firefox(profile)
        self._browser = webdriver.Firefox()
        self.baidu = Baidu(self._browser)
        self.map = BaiduMap()
        self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"
        self.table = Tables()

    def __del__(self):
        self._browser.quit()
        # self.record is not created in this class; guard the close to avoid
        # an AttributeError during teardown
        if hasattr(self, "record"):
            self.record.close()

    def loadData(self):
        with open('allprovinces.json') as json_file:
            data = json.load(json_file)
        return data

    def getData(self):
        data = self.loadData()
        if not data:
            return None
        for pro in data["provincesList"]:
            cities = pro["Citys"]
            proname = pro["Name"]
            if len(cities) > 0:
                for city in cities:
                    name = city["Name"]
                    self._logger.info("current city: " + name)
                    cityInfo = self.baidu.baike(name)
                    if not cityInfo:
                        continue
                    cityBasic = cityInfo["basic"]
                    summary = cityInfo["summary"]
                    cityImage = self.baidu.niceImage(name + '壁纸', width=1300, height=750)
                    # default coordinates in case geocoding fails
                    lng = "0.0"
                    lat = "0.0"
                    cityGeo = self.map.getGeoAddress(name, self.ak)
                    if cityGeo:
                        if "location" in cityGeo.keys():
                            location = cityGeo["location"]
                            lng = location["lng"]
                            lat = location["lat"]
                        else:
                            lng = "0.0"
                            lat = "0.0"
                    cityID = city["Id"]
                    zoneNum = zipCode = area = climate = ptype = acreage = ""
                    if u"电话区号" in cityBasic:
                        zoneNum = cityBasic[u"电话区号"]
                    if u"邮政区码" in cityBasic:
                        zipCode = cityBasic[u"邮政区码"]
                    if u"地理位置" in cityBasic:
                        area = cityBasic[u"地理位置"]
                    if u"气候条件" in cityBasic:
                        climate = cityBasic[u"气候条件"]
                    if u"行政区类别" in cityBasic:
                        ptype = cityBasic[u"行政区类别"]
                    if u"面 积" in cityBasic:
                        acreage = cityBasic[u"面 积"]
                    cityParams = (cityID, name, proname, ptype, area, zoneNum, acreage,
                                  climate, zipCode, lng, lat, summary)
                    self.table.insertTable("city", cityParams)
                    if cityImage:
                        for pic in cityImage:
                            self.table.insertTable("cityImages", (cityID, str(uuid.uuid1()), pic, "", ""))
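# Hedged usage sketch for the city grabber above: instantiate Grab and run
# getData(), which reads allprovinces.json and fills the "city" and
# "cityImages" tables. Nothing beyond the names defined in the class is assumed.
if __name__ == "__main__":
    grab = Grab()
    grab.getData()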
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'chinawestnews.net', 'news', '中国西部网')
    obj.main()
import unittest
import HTMLTestRunner
from baidu import Baidu

if __name__ == "__main__":
    # unittest.main()
    suite = unittest.TestSuite()
    # suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Baidu))
    suite.addTest(Baidu('test_baidu'))
    # name of the generated test report
    filename1 = "result.html"
    fp = open(filename1, 'wb')
    # define the stream, title and description of the test report
    runner = HTMLTestRunner.HTMLTestRunner(stream=fp,
                                           title=u'自动化测试报告',
                                           description=u'自动化测试报告')
    runner.run(suite)
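# Hedged sketch of the kind of test case the suite above expects baidu.py to
# provide: a unittest.TestCase named Baidu with a test_baidu method. The
# selenium-based body below is an assumption, not taken from the snippet.
import unittest
from selenium import webdriver

class Baidu(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_baidu(self):
        self.driver.get('https://www.baidu.com')
        self.assertIn(u'百度', self.driver.title)

    def tearDown(self):
        self.driver.quit()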
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'sc.sina.com.cn', 'news', '新浪四川')
    obj.main()
from aiohttp import web

from ting89 import Ting89
# from woai import Woai
from baidu import Baidu
from ysts8 import Ysts8

GTing89 = Ting89()
# Gwa = Woai()
GBaidu = Baidu()
GYsts8 = Ysts8()


async def handle(request):
    return web.Response(text='only post support')


async def searchHandler(request):
    inData = await request.json()
    inName = inData['name']
    # print('searching', inName)
    data1 = GTing89.search(inName)
    data2 = GYsts8.search(inName)
    data3 = GBaidu.search(inName)
    outData = [{
        'mod': 'ting89',
        'data': data1
    }, {
        'mod': 'ysts8',
        'data': data2
    }, {
        'mod': 'baidu',
        'data': data3
    }]
    # the original snippet is cut off here; returning the aggregated results
    # as JSON is an assumed completion
    return web.json_response(outData)
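# Hedged usage sketch: how a client might call searchHandler, assuming it is
# registered on a '/search' route (the route is an assumption; the snippet
# does not show the app.router setup).
import asyncio
import aiohttp

async def query(name):
    async with aiohttp.ClientSession() as session:
        async with session.post('http://localhost:8080/search',
                                json={'name': name}) as resp:
            return await resp.json()

# asyncio.run(query('some audiobook name'))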
import sys
import json

from google import Google
from bing import Bing
from yahoo import Yahoo
from duckduckgo import Duckduckgo
from ask import Ask
from yandex import Yandex
from baidu import Baidu
from exalead import Exalead
from quora import Quora
from youtube import Youtube
from parsijoo import Parsijoo

scrapers = {
    'g': Google(),
    'b': Bing(),
    'y': Yahoo(),
    'd': Duckduckgo(),
    'a': Ask(),
    'yd': Yandex(),
    'u': Baidu(),
    'e': Exalead(),
    'q': Quora(),
    't': Youtube(),
    'p': Parsijoo()
}


def read_in():
    lines = sys.stdin.readlines()
    return json.loads(lines[0])


def small_test():
    # scrapers is a dict keyed by short codes, so index it instead of using
    # attribute access
    assert isinstance(scrapers['g'].results_search('fossasia'), list)
def main(id):
    # if it is a blog, change the type 'news' to 'blog'
    obj = Baidu(id, 'www.scwmw.gov.cn', 'news', '四川文明网')
    obj.main()
c2e_opt.model = opt.c2e_model
c2e = C2E(c2e_opt)

e2c_opt = opt
e2c_opt.bpe_codes = opt.e2c_codes
e2c_opt.model = opt.e2c_model
e2c = E2C(e2c_opt)

#### youdao info
youdao_app_id = "5dcd671707dab45f"
youdao_key = "Z4h3RJgLdmsKro9148kxm13zzHh9YjkI"
youdao_app = Youdao(youdao_app_id, youdao_key)

baidu_app_id = "20160825000027412"
baidu_key = "nqZwgqSR74topEKu8MGL"
baidu_app = Baidu(baidu_app_id, baidu_key)

google_app = Google()
#######

###
def cmcmTran(doc, direction):
    if direction == 'e2c':
        rstr = e2c.translate(doc.strip())
    else:
        rstr = c2e.translate(doc.strip())
    return rstr.strip()
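# Hedged usage sketch of cmcmTran; the sample sentences are illustrative only.
if __name__ == "__main__":
    print(cmcmTran('machine translation is fun', 'e2c'))  # English -> Chinese
    print(cmcmTran('机器翻译很有趣', 'c2e'))              # Chinese -> English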
class Grab:
    def __init__(self):
        self._logger = Logger(__file__)
        # the entry point of grabbing
        self.base = "http://scenic.cthy.com"
        self.provinces = []
        # self._browser = webdriver.PhantomJS()
        self._browser = webdriver.Firefox()
        self.tabopera = Tables()
        self.record = open("record.txt", "a+")
        self.fdate = open("date.txt", "a+")
        self.fprice = open("price.txt", "a+")
        self.sprovince = 0
        self.spage = 1
        self.snum = 0
        self.picturenum = 10
        self.baidu = Baidu(self._browser)
        self.map = BaiduMap()
        self.ak = "sh0wDYRg1LnB5OYTefZcuHu3zwuoFeOy"

    def __del__(self):
        self._browser.quit()
        self.record.close()

    def getProvinces(self):
        '''Get the link, area and number information of the provinces.

        # Process:
            1) Get the source code of the entry point (http://scenic.cthy.com) with the browser
            2) Find the tag which contains the information of the provinces
            3) Extract the link, area and number information of every province
        # Return:
            The list of province numbers extracted from the links (self.provinces),
            or None if the province map cannot be found.
        '''
        self._browser.get(self.base)
        entry = BeautifulSoup(self._browser.page_source)
        map = entry.find("map")
        if map:
            # the pattern to extract the number from a link
            pattern = re.compile(r".*(\d\d)/")
            self._logger.info("got the tag containing the information of provinces")
            for item in map.find_all("area"):
                number = re.findall(pattern, item.attrs["href"])
                if number:
                    self.provinces.append(number[0])
                else:
                    continue
        else:
            self._logger.info("sorry, did not get the province map data")
            return None
        return self.provinces

    def searchAll(self):
        for i in range(self.sprovince, len(self.provinces)):
            self.searchScenic(i)

    def searchScenic(self, num):
        """Extract the scenic information of a specified province.

        # Parameters:
            num: the number of the province whose scenic information you want to grab
        # Return:
        """
        prefix = "/scenicSearch/"
        suffix = "-0-0-0-0-1.html"
        self._browser.get(self.base + prefix + str(self.provinces[num]) + suffix)
        first = BeautifulSoup(self._browser.page_source)
        """
        The content of the tags:
            # the total records
            [<span class="f14 point">135</span>,
            # how many pages
            <span class="f14 point">14</span>,
            # the number of records on one page
            <span class="f14 point">10</span>]
        """
        palist = first.find(id="PagerList")
        if palist:
            tags = palist.select("li > span")
        else:
            return False
        if tags and len(tags) >= 2:
            pageCount = int(tags[1].string)
            self._logger.info("total: " + tags[0].string + " records. " +
                              "total " + tags[1].string + " pages")
        else:
            return False
        for i in range(self.spage, pageCount + 1):
            self.searchSeniceSpiPage(num, str(i))
        # important: it must be reset to 1
        self.spage = 1
        return True

    def searchSeniceSpiPage(self, num, pagenum):
        """Search scenic information from a specified page of a specified province.

        # Parameters:
            num: the number of the province whose scenic information you want to grab
            pagenum: the page you want to extract scenic information from
        # Return:
        """
        addr = "/scenicSearch/" + str(self.provinces[num]) + "-0-0-0-0-" + str(pagenum) + ".html"
        # record the current searching page
        self._browser.get(self.base + addr)
        page = BeautifulSoup(self._browser.page_source)
        sightTags = page.select("div.sightlist > div.sightshow > div.sightdetail > h4 > a")
        link = ""
        if sightTags:
            for i in range(self.snum, len(sightTags)):
                # record the number of the province, page and item for recovery
                self.record.write(str(num) + " " + str(pagenum) + " " + str(i) + "\n")
                self._logger.info("current position: " + str(num) + " " + str(pagenum) + " " + str(i))
                self._logger.info("got the link of " + sightTags[i].string)
                link = sightTags[i].attrs["href"]
                self.extractScenicInfor(link)
        else:
            self._logger.error("searchSeniceSpiPage: can not get the list of scenics")
            return False
        # important: it must be reset to 1
        self.snum = 0
        return True

    def extractScenicInfor(self, link):
        """Extract scenic information from the given scenic address.

        # Parameters:
            link: the address where you can get detailed information of the scenic
        # Return:
        """
        scenic = self.extractScenicAbout(link)
        if not scenic:
            return False
        scenic = self.remedy(scenic)
        scenic = self.remedyMap(scenic)
        self.tabopera.insertData(scenic)
        return True

    def remedy(self, scenic):
        """If the result of extractScenicAbout is not enough, access Baidu for more information."""
        openpat = u"开放时间"
        suggpat = u"时长"
        areapat = u"面积"
        pricepat = u"门票"
        # this is for getting longitude and latitude
        scenic.mapname = scenic.name
        # remedy pictures
        picnum = len(scenic.images)
        if picnum < 10:
            self._logger.info("There are " + str(picnum) + " pictures. Getting the rest from Baidu image")
            imgs = self.baidu.image(scenic.name, self.picturenum - len(scenic.images))
            if imgs:
                scenic.images.extend(imgs)
        if not scenic.description:
            self._logger.info("Got details from baike")
            baike = self.baidu.baike(scenic.name)
            if not baike:
                self._logger.error("Remedy: can not get information from Baidu baike")
                return scenic
            if "detail" in baike.keys():
                scenic.description = baike["detail"]
        else:
            baike = self.baidu.baike(scenic.name, False)
            if not baike:
                self._logger.error("Remedy: can not get information from Baidu baike")
                return scenic
        # use the name in baike for Baidu searching
        if "name" in baike.keys():
            scenic.mapname = baike["name"]
        if "basic" in baike.keys():
            basic = baike["basic"]
            for item in basic.keys():
                if re.findall(openpat, item):
                    times = re.findall(r"(\d+[:|;]\d+).*(\d+[:|;]\d+)", basic[item])
                    if times:
                        scenic.opentime = times[0][0]
                        scenic.closetime = times[0][1]
                    else:
                        scenic.opentime = "00:00"
                        scenic.closetime = "23:00"
                if re.findall(suggpat, item):
                    scenic.suggest = basic[item]
                if re.findall(pricepat, item):
                    scenic.price = basic[item]
                if re.findall(areapat, item):
                    scenic.area = basic[item]
        if not scenic.opentime:
            scenic.opentime = "00:00"
        if not scenic.closetime:
            scenic.closetime = "23:00"
        if not scenic.price:
            scenic.price = "0"
        if not scenic.area:
            scenic.area = "未知"
        if not scenic.symbol:
            if scenic.images:
                scenic.symbol = scenic.images[0]
        return scenic

    def remedyMap(self, scenic):
        # map relatives:
        mapret = self.map.getGeoAddress(scenic.mapname, self.ak)
        if mapret:
            if "location" in mapret.keys():
                scenic.latitude = "%.13f" % mapret["location"]["lat"]
                scenic.longitude = "%.13f" % mapret["location"]["lng"]
            if "precise" in mapret.keys():
                scenic.precise = str(mapret["precise"])
            if "confidence" in mapret.keys():
                scenic.confidence = str(mapret["confidence"])
        return scenic

    def extractScenicAbout(self, link):
        """Extract the introduction, geographic position, type, quality and class information.

        # Parameters:
            link: the address where you can get detailed information of the scenic
        # Return:
            A Scenic object with the following attrs:
                province:
                city:
                types:
                level:
                fits:
                description:
                images:
        """
        scenic = Scenic()
        # get the symbol picture and the name of the scenic from the index page
        self._browser.get(link)
        first = BeautifulSoup(self._browser.page_source)
        symbol = first.select("div.sightfocuspic > img")
        if symbol:
            scenic.symbol = symbol[0].attrs["src"] and self.base + symbol[0].attrs["src"] or ""
        scename = first.select("div.sightprofile > h4")
        if scename:
            scenic.name = scename[0].string
        # if we cannot get the scenic name, the page is wrong
        else:
            self._logger.error("Cannot get the scenic name. The page may be wrong, please check it")
            return None
        # get detailed information about the scenic from the about page
        addr = link + "about.html"
        self._browser.get(addr)
        about = BeautifulSoup(self._browser.page_source)
        relative = about.select("div.main > div.wrap > div.pright > div.pfood > ul#RightControl11_ScenicBaseInfo > li")
        if len(relative) == 5:
            # get the province and city information
            pos = relative[0].select("a")
            # it is only right when we extract exactly two items
            if len(pos) == 2:
                if pos[0].string:
                    scenic.province = pos[0].string
                if pos[1].string:
                    scenic.city = pos[1].string
                self._logger.info("current position: province: " + scenic.province + " city: " + scenic.city)
            else:
                return None
            # get the type of the scenic
            for item in relative[1].select("a"):
                if item.string:
                    scenic.types.append(item.string)
            # get the quality of the scenic
            qua = relative[2].find("a")
            if qua:
                scenic.quality = qua.string
            # get the scenic level
            lev = relative[3].find("a")
            if lev:
                scenic.level = lev.string
            # get the fit time of the scenic
            for item in relative[4].select("a"):
                if item.string:
                    scenic.fits.append(item.string)
        else:
            self._logger.error("there is no relative information " + str(len(relative)))
            return None
        # get the description of the scenic
        desc = about.find(id="AboutInfo")
        if desc:
            for s in desc.stripped_strings:
                scenic.description = scenic.description + s + "\n"
            for item in desc.find_all("p"):
                # if a p tag contains an image address, it always has the style or align attr
                attrs = item.attrs
                if "style" in attrs.keys() or "align" in attrs.keys():
                    for img in item.find_all("img"):
                        if not img.attrs["src"]:
                            continue
                        scenic.images.append(self.base + img.attrs["src"])
                else:
                    for s in item.stripped_strings:
                        scenic.description = scenic.description + s + "\n"
        else:
            self._logger.info("there is no description information or scenic pictures")
        scenic.website = link
        return scenic

    def extractScenicAttractions(self, link):
        """Extract the attraction information of a specified scenic.

        # Parameters:
            link: the address where you can get the attractions of the scenic
        # Return:
            A list of dicts; each item contains the following attrs:
        """
        attractions = []
        addr = link + "about.html"
        self._browser.get(addr)
        page = BeautifulSoup(self._browser.page_source)
        lists = page.select("")

    def startGrab(self):
        content = self.record.readlines()
        # if there is a record, resume from the last recorded position
        if len(content) != 0:
            line = content[len(content) - 1]
            strs = line.split(" ")
            self.sprovince = int(strs[0])
            self.spage = int(strs[1])
            self.snum = int(strs[2])
        self.getProvinces()
        self.searchAll()
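# Hedged usage sketch for the scenic crawler above: startGrab() resumes from
# the last position recorded in record.txt (if any), then walks every province.
# Nothing beyond the names defined in the class is assumed.
if __name__ == "__main__":
    grab = Grab()
    grab.startGrab()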