def main(self):
    """
    Main entry point for report generation.
    Builds the test report from the status YAML files.
    :return:
    """
    import GetHtml
    self.__analyze_log()
    result = self.__yaml_file(self.all_result_path, '.yaml')
    lst = []
    for case_name, confirm_status in result.items():
        case_name = str(case_name).split('.')[0]
        case_result = self.__open_yaml(confirm_status)
        case_img = self.__confirm_file(
            str(confirm_status).replace('status', 'img').replace('yaml', 'png'))
        case_per = self.__confirm_file(
            str(confirm_status).replace('status', 'per').replace('yaml', 'png'))
        case_log = self.__confirm_file(
            str(confirm_status).replace('status', 'log').replace('yaml', 'log'))
        case_filter = self.__confirm_file(
            str(confirm_status).replace('status', 'log').replace('yaml', 'log')
            .replace(case_name, case_name + 'filter'))
        if case_per is None:
            # Fall back to the configured error image
            ini = U.ConfigIni()
            case_per = ini.get_ini('test_case', 'error_img')
        lst.append(
            GetHtml.get_html_tr(case_name, case_result, case_img, case_per,
                                case_log, case_filter))
    GetHtml.get_html(''.join(lst), self.__app_info(), self.__device_info(),
                     self.__test_case_execution_status(), self.all_result_path)
def main():
    startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
    for i in range(1, 100):
        timeStart = time.perf_counter()
        print(timeStart)
        url = "https://cd.fang.lianjia.com/loupan/pg{}/".format(i)
        # Fetch the HTML for the url
        html = GetHtml.getHTMLTest(url)
        # Parse the listing data out of the html
        allhouser = parserList(html)
        # print(allhouser)
        for houser in allhouser:
            print(houser)
        # # Batch-write to the database
        # obj = OrmTest()
        # rest = obj.add_one(houser)
        # print(rest.id)
        print(time.perf_counter() - timeStart)
    endTime = time.perf_counter() - startTime
    print("Crawl finished, elapsed time: {}".format(endTime))
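# Hedged sketch: getHTMLTest is a local helper in the GetHtml module and its
# real body is not shown in these snippets. Assuming it simply fetches a page
# and returns its text, a minimal version using requests could look like the
# following; the headers, timeout, and empty-string fallback are assumptions,
# not the actual implementation.
import requests

def getHTMLTest(url):
    try:
        r = requests.get(url, timeout=10,
                         headers={'User-Agent': 'Mozilla/5.0'})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Callers above test len(html), so return an empty string on failure
        return ""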
def main():
    slistObj = StatusList()
    result = slistObj.test_search()
    for son in result:
        print(son.SonUrl)
        url = "https://cd.fang.lianjia.com/loupan/p_{}/huxingtu/".format(
            son.SonUrl)
        # Fetch the HTML for the url
        html = GetHtml.getHTMLTest(url)
        houseList = []
        if len(html) > 1:
            print("Got data for this page")
            # Parse the html
            houseList = parserList(html)
        else:
            print("No floor-plan data found")
        print(houseList)
        for house in houseList:
            house.append(son.SonUrl)
            house.append(son.name)
            # print(len(house))
            # print(house)
            # print(house[0])
            HouseObj = HouseList()
            HouseObj.add_one(house)
def main():
    # startTime = time.perf_counter()
    # for i in range(1, 100):
    #     timeStart = time.perf_counter()
    #     print(timeStart)
    url = "https://cd.fang.lianjia.com/loupan/pg{}/".format(1)
    # Fetch the HTML for the url
    html = GetHtml.getHTMLTest(url)
    # Parse the listing data out of the html
    allhouser = parserList(html)
    print(allhouser)
    for houser in allhouser:
        # print(len(houser[8]))
        print(houser)
def main():
    startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
    # Query the catalog
    # catalog_obj = SQLCatalogOEM.OrmTest()
    # result = catalog_obj.test_search()
    staList_obj = StatusList()
    result = staList_obj.test_search()
    obj = Picture()
    for item in result:
        status = obj.selectBySonUrl(item.SonUrl)
        # print(status)
        if not status:
            # Not crawled yet; fetch it now
            timeStart = time.perf_counter()
            # print(item.SonUrl, item.name)
            HtmlText = GetHtml.getHTMLTest(
                "https://cd.fang.lianjia.com/loupan/p_{}/xiangce/".format(
                    item.SonUrl))
            # Collect every image on the album page
            items = parserList(HtmlText)
            if len(items) > 0:
                # print(items)
                # print(len(items))
                print(item.SonUrl, item.name)
                images_obj = itemToNews(items)
                images_obj.name = item.name
                images_obj.SonUrl = item.SonUrl
                # images_obj.ImageBackup = ','.join(items)
                # print(images_obj.licenseImages)
                # Batch-write to the database
                obj = Picture()
                rest = obj.add_one(images_obj)
                print("Record {} crawled in {} s".format(
                    rest.id, time.perf_counter() - timeStart))
        else:
            print("Duplicate data, duplicate url: {}".format(item.SonUrl))
    endTime = time.perf_counter() - startTime
    print("Crawl finished, elapsed time: {}".format(endTime))
def mainStatusList():
    startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
    for i in range(1, 39):
        timeStart = time.perf_counter()
        url = "https://cd.fang.lianjia.com/loupan/nht{}pg{}/".format(5, i)
        # Fetch the HTML for the url
        html = GetHtml.getHTMLTest(url)
        # Parse the listing data out of the html
        allhouser = parserUpdateList(html)
        # allhouser = parserList(html)
        print(allhouser)
        for houser in allhouser:
            # print(len(houser[8]))
            print(houser)
            # Batch-write to the database
            obj = OrmTest()
            rest = obj.add_one(houser)
            print(rest.id)
        print(time.perf_counter() - timeStart)
    endTime = time.perf_counter() - startTime
    print("Crawl finished, elapsed time: {}".format(endTime))
def get_article(Blog, url):
    html = GetHtml.GetHtml(url, Blog)
    print(html.get_cnt())
    article_list = html.get_article_list()
    # Replaces the original bare try/except around os.mkdir('source')
    os.makedirs('source', exist_ok=True)
    for article in article_list:
        with open('source/' + article.title + '.md', 'w',
                  encoding='utf-8') as w:
            # Build the Hexo-style front matter block
            text = ("---\n" + 'title: ' + article.title + "\n" +
                    'date: ' + article.time + "\n")
            if article.category:
                text += 'categories:\n- ' + article.category + "\n"
            if article.tag:
                text += 'tags:\n'
                for t in article.tag:
                    text += '- ' + t + "\n"
            text += "---\n"
            Markdown = HtmlToMarkdown.HtmlToMarkdown(article.text)
            text += Markdown.get_string()
            w.write(text)
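# Hedged usage sketch for get_article: judging from GetHtml.GetHtml(url, Blog),
# Blog appears to identify the blog being exported and url its address. Both
# values below are hypothetical placeholders, not names from the source.
if __name__ == '__main__':
    get_article('my-blog', 'https://example.com/blog')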
print(CurtimeString)

# Open the database and create the urllist table. After the front page has
# been crawled, every URL found is written into the urllist table of testdb.
pDataBase = mysqlop.open(g_pDataBaseName)
mysqlop.test(pDataBase)
mysqlop.createtable(pDataBase, g_pTableName)

# Delete and recreate the output directory
if os.path.exists(g_SaveDir):
    shutil.rmtree(g_SaveDir)
os.mkdir(g_SaveDir)

MainHTMLDir = g_SaveDir + '\\MainDir'
mysqlop.insert(pDataBase, 'urllist', 0, g_Web, 'NoClimb')
mysqlop.close(pDataBase)
GetHtml.ClimbHtml(g_Web, MainHTMLDir)

pDataBase = mysqlop.open(g_pDataBaseName)
urlnum = mysqlop.getitemnum(pDataBase, g_pTableName)
print('Total urlnum = %d' % urlnum)

# Row 0 is MainDir itself, so start crawling from table ID 1.
g_CurUrlID = 1
while True:
    if g_CurUrlID < urlnum:
        # Number of URLs read in one crawl pass
        savepath = g_SaveDir + "\\out%d" % g_CurUrlID
        Cururl = mysqlop.read(pDataBase, g_pTableName, g_CurUrlID)
        print('----------------------------------------')
        # print('SavePath = [%s]' % savepath)
        print('Climbing [%s] ...' % Cururl)
def maintest():
    url = "https://cd.fang.anjuke.com/loupan/all/p1/"
    # Fetch the HTML for the url
    html = GetHtml.getHTMLTest(url)
    parserList(html)
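# Hedged sketch: parserList is defined elsewhere in these scripts and its body
# is not shown. Assuming it extracts one record per listing card with
# BeautifulSoup, a minimal version might look like this; the 'item-mod' class
# name and the returned fields are assumptions about the page markup, not the
# actual parser.
import bs4

def parserList(html):
    soup = bs4.BeautifulSoup(html, "html.parser")
    results = []
    for card in soup.find_all('div', attrs={'class': 'item-mod'}):
        name_tag = card.find('a')
        if name_tag is not None:
            # One record per listing card; real code extracts more fields
            results.append([name_tag.get_text(strip=True)])
    return results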
# Used for date handling and directory handling
import datetime
import os
# Used to insert a fixed wait between requests
import time

import GetHtml

# Create a directory named after today's date if it does not exist yet
today = datetime.date.today()
dirname = today.strftime('%Y%m%d')
if not os.path.exists(dirname):
    os.mkdir(dirname)

# start_url = 'http://resource.pcassist.co.jp/sozai/IT56/chapter13/sample13_3_2.html'
start_url = 'http://shop.zhongyeyuan.com.cn/'

if __name__ == '__main__':
    print("----- Zac START -----")
    GetHtml.get_html(start_url, dirname)
    print("----- Zac END -----")
def getOtherText(projectName, id):
    global numberall
    startTime = time.perf_counter()
    # print("House id >>>> {}".format(id))
    # Fetch the detail page for this house. The original format string had
    # only one placeholder, so the id argument was silently dropped; it
    # presumably belongs in the fb_expo_id query parameter.
    otherHtml = GetHtml.getHTMLTest(
        "https://cd.fang.lianjia.com/loupan/p_{}/?fb_expo_id={}".format(
            projectName, id))
    # Parse the detail page
    resultSoup = bs4.BeautifulSoup(otherHtml, "html.parser")
    # The div that holds the detail fields
    allResultTag = resultSoup.find(name='div', attrs={'class': 'box-loupan'})
    # print("House details".center(40, "-"))
    otherlist = []
    for tagitem in allResultTag:
        if isinstance(tagitem, bs4.element.Tag):
            for tag in tagitem:
                # Content starts here
                if isinstance(tag, bs4.element.Tag):
                    for t1 in tag:
                        if isinstance(t1, bs4.element.Tag):
                            for t2 in t1:
                                if isinstance(t2, bs4.element.Tag):
                                    for t3 in t2:
                                        if isinstance(t3, bs4.element.Tag):
                                            for t4 in t3:
                                                otherlist.append(
                                                    t4.replace(' ', '')
                                                    .replace('\n', '')
                                                    .replace(';', ''))
                                        elif isinstance(
                                                t3,
                                                bs4.element.NavigableString):
                                            if len(t3.string) > 1:
                                                # Non-empty text node: keep it
                                                otherlist.append(
                                                    t3.string.replace(' ', '')
                                                    .replace('\n', '')
                                                    .replace(';', ''))
                        elif isinstance(t1, bs4.element.NavigableString):
                            if len(t1.string) > 1:
                                # Non-empty text node: keep it
                                otherlist.append(
                                    t1.string.replace(' ', '')
                                    .replace('\n', '').replace(';', ''))
    print("Record {}, took {} s".format(numberall,
                                        time.perf_counter() - startTime))
    return otherlist
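# A minimal alternative sketch: BeautifulSoup's Tag.stripped_strings generator
# walks every descendant text node, which collapses the six nested loops above
# into one comprehension. It assumes the same 'box-loupan' div structure; the
# whitespace/semicolon cleanup mirrors the original. The function name is
# hypothetical.
def get_other_text_flat(other_html):
    soup = bs4.BeautifulSoup(other_html, "html.parser")
    box = soup.find(name='div', attrs={'class': 'box-loupan'})
    if box is None:
        return []
    return [s.replace(' ', '').replace('\n', '').replace(';', '')
            for s in box.stripped_strings if len(s) > 1]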
def test():
    # Note: item is not defined in this scope; it must come from a global or
    # be passed in by the caller.
    HtmlText = GetHtml.getHTMLTest(
        "https://cd.fang.lianjia.com/loupan/p_{}/xiangce/".format(item.SonUrl))
    # Collect every image on the album page
    items = parserList(HtmlText)