Example #1
def main(self):
    """
    Main entry point for generating the test report.
    Builds the report from the status yaml files.
    :return:
    """
    import GetHtml
    self.__analyze_log()
    result = self.__yaml_file(self.all_result_path, '.yaml')
    lst = []
    for case_name, confirm_status in result.items():
        case_name = str(case_name).split('.')[0]
        case_result = self.__open_yaml(confirm_status)
        case_img = self.__confirm_file(
            str(confirm_status).replace('status', 'img').replace('yaml', 'png'))
        case_per = self.__confirm_file(
            str(confirm_status).replace('status', 'per').replace('yaml', 'png'))
        case_log = self.__confirm_file(
            str(confirm_status).replace('status', 'log').replace('yaml', 'log'))
        case_filter = self.__confirm_file(
            str(confirm_status).replace('status', 'log').replace(
                'yaml', 'log').replace(case_name, case_name + 'filter'))
        if case_per is None:
            # Fall back to the configured error image
            ini = U.ConfigIni()
            case_per = ini.get_ini('test_case', 'error_img')
        lst.append(
            GetHtml.get_html_tr(case_name, case_result, case_img, case_per,
                                case_log, case_filter))
    GetHtml.get_html(''.join(lst), self.__app_info(), self.__device_info(),
                     self.__test_case_execution_status(),
                     self.all_result_path)
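Note: GetHtml.get_html_tr and GetHtml.get_html are not shown in this listing. A minimal sketch of what such a row builder might look like, with the column order and HTML layout assumed rather than taken from the real GetHtml module:

def get_html_tr(case_name, case_result, case_img, case_per, case_log, case_filter):
    # Render one test case as a single HTML table row (hypothetical layout).
    cells = [case_name, str(case_result), case_img or '', case_per or '',
             case_log or '', case_filter or '']
    return '<tr>' + ''.join('<td>{}</td>'.format(c) for c in cells) + '</tr>'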
Example #2
def main():
    startTime = time.perf_counter()
    for i in range(1, 100):
        timeStart = time.perf_counter()
        print(timeStart)
        url = "https://cd.fang.lianjia.com/loupan/pg{}/".format(i)
        # Fetch the HTML for the url
        html = GetHtml.getHTMLTest(url)
        # Parse the html into house records
        allhouser = parserList(html)
        # print(allhouser)
        for houser in allhouser:
            print(houser)
            # # Batch write to the database
            # obj = OrmTest()
            # rest = obj.add_one(houser)
            # print(rest.id)
        print(time.perf_counter() - timeStart)

    endTime = time.perf_counter() - startTime
    print("Crawling finished, total time {}".format(endTime))
Example #3
def main():
    slistObj = StatusList()
    result = slistObj.test_search()
    for son in result:
        print(son.SonUrl)

        url = "https://cd.fang.lianjia.com/loupan/p_{}/huxingtu/".format(
            son.SonUrl)
        # Fetch the HTML for the url
        html = GetHtml.getHTMLTest(url)

        houseList = []
        if len(html) > 1:
            print("Got data")

            # Parse the html
            houseList = parserList(html)
        else:
            print("No floor-plan data")
        print(houseList)
        for house in houseList:
            house.append(son.SonUrl)
            house.append(son.name)
            # print(len(house))
            # print(house)
            # print(house[0])
            HouseObj = HouseList()
            HouseObj.add_one(house)
Example #4
def main():
    # startTime = time.clock()
    # for i in range(1,100):
    #     timeStart = time.perf_counter()
    #     print(timeStart)
    url = "https://cd.fang.lianjia.com/loupan/pg{}/".format(1)
    # Fetch the HTML for the url
    html = GetHtml.getHTMLTest(url)
    # Parse the html into house records
    allhouser = parserList(html)
    print(allhouser)
    for houser in allhouser:
        # print(len(houser[8]))
        print(houser)
Example #5
def main():
    startTime = time.perf_counter()
    # Query the catalog
    # catalog_obj = SQLCatalogOEM.OrmTest()
    # result = catalog_obj.test_search()
    staList_obj = StatusList()
    result = staList_obj.test_search()

    obj = Picture()
    for item in result:

        status = obj.selectBySonUrl(item.SonUrl)
        # print(status)
        if not status:
            # print("当前不存在,进行爬取")
            timeStart = time.perf_counter()
            # print(item.SonUrl,item.name)
            HtmlText = GetHtml.getHTMLTest(
                "https://cd.fang.lianjia.com/loupan/p_{}/xiangce/".format(
                    item.SonUrl))
            # Get all the images
            items = parserList(HtmlText)
            if len(items) > 0:
                # print(items)
                # print(len(items))
                print(item.SonUrl, item.name)
                images_obj = itemToNews(items)
                images_obj.name = item.name
                images_obj.SonUrl = item.SonUrl
                # images_obj.ImageBackup = ','.join(items)
                # print(images_obj.licenseImages)
                # Write to the database
                obj = Picture()
                rest = obj.add_one(images_obj)
                print("第{}条爬取完成,用时 {}".format(rest.id,
                                              time.perf_counter() - timeStart))
        else:
            print("数据重复,重复Url:{}".format(item.SonUrl))

    endTime = time.perf_counter() - startTime
    print("Crawling finished, total time {}".format(endTime))
Example #6
def mainStatusList():
    startTime = time.perf_counter()
    for i in range(1, 39):
        timeStart = time.perf_counter()
        url = "https://cd.fang.lianjia.com/loupan/nht{}pg{}/".format(5, i)
        # Fetch the HTML for the url
        html = GetHtml.getHTMLTest(url)
        # Parse the html into house records
        allhouser = parserUpdateList(html)
        # allhouser = parserList(html)
        print(allhouser)
        for houser in allhouser:
            # print(len(houser[8]))
            print(houser)
            # Write to the database
            obj = OrmTest()
            rest = obj.add_one(houser)
            print(rest.id)
            print(time.perf_counter() - timeStart)
    endTime = time.perf_counter() - startTime
    print("Crawling finished, total time {}".format(endTime))
Example #7
def get_article(Blog, url):
    html = GetHtml.GetHtml(url, Blog)
    print(html.get_cnt())
    article_list = html.get_article_list()
    # Create the output directory if it does not already exist
    os.makedirs('source', exist_ok=True)
    for article in article_list:
        with open('source/' + article.title + '.md', 'w', encoding='utf-8') as w:
            text = "---\n" + 'title: ' + article.title + "\n" + 'date: ' + article.time + "\n"
            if article.category != '':
                text += 'categories:\n- ' + article.category + "\n"
            if article.tag != []:
                text += 'tags:\n'
                for t in article.tag:
                    text += '- ' + t + "\n"
            text += "---\n"
            Markdown = HtmlToMarkdown.HtmlToMarkdown(article.text)
            text += Markdown.get_string()
            w.write(text)
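Note: the HtmlToMarkdown class used above is not shown here. A rough stand-in can be built on the html2text package; this is an assumption about the conversion step, not the original implementation:

import html2text

def html_to_markdown(html):
    # Convert an HTML fragment to Markdown with html2text (hypothetical helper).
    converter = html2text.HTML2Text()
    converter.body_width = 0  # do not hard-wrap output lines
    return converter.handle(html)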
Example #8
print(CurtimeString)
################## Open the database and create the urllist table
pDataBase = mysqlop.open(g_pDataBaseName)
mysqlop.test(pDataBase)
mysqlop.createtable(pDataBase, g_pTableName)
##################
# After the home page is crawled, every URL is written into the urllist table of the testdb database
################# Delete the output directory
if os.path.exists(g_SaveDir):
    shutil.rmtree(g_SaveDir)
os.mkdir(g_SaveDir)
#################
MainHTMLDir = g_SaveDir + '\\MainDir'
mysqlop.insert(pDataBase, 'urllist', 0, g_Web, 'NoClimb')
mysqlop.close(pDataBase)
GetHtml.ClimbHtml(g_Web, MainHTMLDir)

pDataBase = mysqlop.open(g_pDataBaseName)
urlnum = mysqlop.getitemnum(pDataBase, g_pTableName)
print('Total urlnum = %d' % urlnum)
# Skip MainDir: ID 0 is the MainDir entry, so start crawling from table ID 1.
g_CurUrlID = 1

while 1:
    if (g_CurUrlID < urlnum):
        # Number of URLs read in one crawl pass
        savepath = g_SaveDir + "\\out%d" % g_CurUrlID
        Cururl = mysqlop.read(pDataBase, g_pTableName, g_CurUrlID)
        print('----------------------------------------')
        #print('SavePath = [%s]'%savepath)
        print('Climbing [%s] ...' % Cururl)
Example #9
def maintest():
    url = "https://cd.fang.anjuke.com/loupan/all/p1/"
    # Fetch the HTML for the url
    html = GetHtml.getHTMLTest(url)
    parserList(html)
Example #10
# Used for date handling and directory handling
import datetime
import os
# Used to insert a fixed wait between requests
import time
import GetHtml

# Create the date-named directory if it does not exist
today = datetime.date.today()
dirname = today.strftime('%Y%m%d')
if not os.path.exists(dirname):
    os.mkdir(dirname)
#start_url = 'http://resource.pcassist.co.jp/sozai/IT56/chapter13/sample13_3_2.html'
start_url = 'http://shop.zhongyeyuan.com.cn/'

if __name__ == '__main__':
    print("----- Zac START -----")
    GetHtml.get_html(start_url, dirname)
    print("----- Zac END -----")
Example #11
def getOtherText(projectName, id):
    global numberall
    startTime = time.perf_counter()
    # print("房屋id>>>>{}".format(id))
    # 获取房屋的详细数据
    otherHtml = GetHtml.getHTMLTest(
        "https://cd.fang.lianjia.com/loupan/p_{}/?fb_expo_id=".format(
            projectName, id))
    # Parse the detailed house data
    resultSoup = bs4.BeautifulSoup(otherHtml, "html.parser")
    # Find the div tag that holds the detail data
    allResultTag = resultSoup.find(name='div', attrs={'class': 'box-loupan'})
    # print("House details".center(40, "-"))
    otherlist = []
    for tagitem in allResultTag:
        if isinstance(tagitem, bs4.element.Tag):
            for tag in tagitem:
                # Content starts here
                if isinstance(tag, bs4.element.Tag):
                    # print("{} >>> {}".format(tag.string, type(tag)))
                    for t1 in tag:
                        if isinstance(t1, bs4.element.Tag):
                            for t2 in t1:
                                if isinstance(t2, bs4.element.Tag):
                                    for t3 in t2:
                                        if isinstance(t3, bs4.element.Tag):
                                            for t4 in t3:
                                                # print(type(t4))
                                                # print("{}----{}".format(len(t4.string), t4.replace(' ', '').replace('\n', '').replace(';', '')))
                                                # print(t4.string.strip(),end=" ")
                                                otherlist.append(
                                                    t4.replace(' ',
                                                               '').replace(
                                                                   '\n',
                                                                   '').replace(
                                                                       ';',
                                                                       ''))

                                        elif isinstance(
                                                t3,
                                                bs4.element.NavigableString):
                                            if len(t3.string) > 1:
                                                # Valid data
                                                # print("{}---{}".format(len(t3.string), t3.string.replace(' ', '').replace('\n', '').replace(';', '')))
                                                # print(t3.string.strip(),end=" ")
                                                otherlist.append(
                                                    t3.string.replace(
                                                        ' ', '').replace(
                                                            '\n', '').replace(
                                                                ';', ''))

                        elif isinstance(t1, bs4.element.NavigableString):
                            if len(t1.string) > 1:
                                # Valid data
                                # print("{}-{}".format(len(t1.string),t1.string.replace(' ', '').replace('\n', '').replace(';', '')))
                                # print(t1.string.strip(),end=" ")
                                otherlist.append(
                                    t1.string.replace(' ', '').replace(
                                        '\n', '').replace(';', ''))
    print("第{}条,耗时{}秒".format(numberall, time.perf_counter() - startTime))

    return otherlist
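Note: the nested isinstance loops above only collect the non-empty text fragments under the box-loupan div. With BeautifulSoup the same data can usually be gathered more compactly via stripped_strings; a sketch under that assumption, not a drop-in replacement for the original:

def collect_detail_text(result_soup):
    # Gather all non-trivial text nodes under the detail container (hypothetical helper).
    tag = result_soup.find(name='div', attrs={'class': 'box-loupan'})
    if tag is None:
        return []
    return [s.replace(' ', '').replace(';', '')
            for s in tag.stripped_strings if len(s) > 1]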
Example #12
def test(item):
    HtmlText = GetHtml.getHTMLTest(
        "https://cd.fang.lianjia.com/loupan/p_{}/xiangce/".format(item.SonUrl))
    # Get all the images
    items = parserList(HtmlText)