예제 #1
0
    def get_csv_categories(self):
        """Walk the RS Online category menu and return a flat list of
        (first_name, second_name, third_name, third_url) tuples.

        When a second-level page has no "brcategories" list, the second
        level itself is recorded as the third level (with its own URL).
        Every fetch failure removes the current proxy from the pool,
        draws a new one, and retries indefinitely.
        """
        # Fetch the root menu page, rotating proxies until it succeeds.
        while True:
            try:
                html_analyse = HtmlAnalyse(
                    "http://china.rs-online.com/web/c/pcb-prototyping/pcb-cleaning/",
                    proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
        first_categories = bs_content.find_all(
            name="div", attrs={"class": "horizontalMenu sectionUp"})
        third_categories = []
        for first_category in first_categories:
            first_category_name = first_category.span.text
            ul_tags = first_category.find_all(name="ul",
                                              attrs={"class": "column1"})
            for ul_tag in ul_tags:
                li_tags = ul_tag.find_all(name="li")
                for li_tag in li_tags:
                    second_category_url = Rs_Pre_Url + li_tag.a.get("href")
                    # Anchor text minus its inner <span> text is the name.
                    second_category_name = li_tag.a.text.replace(
                        li_tag.a.span.text, "").strip()
                    # Fetch the second-level page with the same retry policy.
                    # NOTE: this rebinds the loop's ``ul_tag`` to the
                    # "brcategories" list (or None) for the check below.
                    while True:
                        try:
                            html_analyse = HtmlAnalyse(second_category_url,
                                                       proxy=self.proxy_ip)
                            bs_content = html_analyse.get_bs_contents()
                            ul_tag = bs_content.find(
                                name="ul", attrs={"class": "brcategories"})

                            break
                        except Exception as e:
                            print(sys._getframe().f_code.co_name, e,
                                  second_category_url)
                            self.proxy_pool.remove(self.proxy_ip)
                            self.proxy_ip = self.proxy_pool.get()
                    if ul_tag:
                        third_category_tags = ul_tag.find_all(
                            name="div", attrs={"class": "rsGARealEstate"})
                        for third_category_tag in third_category_tags:
                            third_category_name = third_category_tag.a.text
                            third_category_url = Rs_Pre_Url + third_category_tag.a.get(
                                "href")
                            third_category = (first_category_name,
                                              second_category_name,
                                              third_category_name,
                                              third_category_url)
                            print(third_category)
                            third_categories.append(third_category)
                    else:
                        # No third level: duplicate the second level downward.
                        third_category = (first_category_name,
                                          second_category_name,
                                          second_category_name,
                                          second_category_url)
                        print(third_category)
                        third_categories.append(third_category)
        return third_categories
예제 #2
0
    def get_suppliers(self):
        """Crawl shop.99114.com area by area, inserting every supplier name
        found on each list page into the ``All_Company_Name`` collection.
        """

        def thread_go(page_url):
            # Worker: fetch one list page and store each company anchor.
            # NOTE(review): closes over ``city_url`` from the loop below;
            # safe only if multi_process finishes before the next city —
            # confirm ThreadingPool.multi_process blocks.
            html_analyse = HtmlAnalyse(page_url)
            # Retry the fetch forever; failures are only logged.
            while True:
                try:
                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(e)
            # Company links open in a new tab and carry a numeric path.
            company_tags = bs_content.find_all(name="a",
                                               attrs={
                                                   "target": "_blank",
                                                   "href": re.compile(r"/\d+")
                                               })
            corporations = []
            for company_tag in company_tags:
                corporation = company_tag.text.strip()
                corporation_dict = {
                    "corporation": corporation,
                    "province_url": city_url,
                    "page_url": page_url,
                    "状态": "未完成",
                    "from": "99114"
                }
                corporations.append(corporation)
                col = self.db.All_Company_Name
                col.insert(corporation_dict)
            print(corporations)
            return corporations

        html_analyse = HtmlAnalyse("http://shop.99114.com/")
        bs_content = html_analyse.get_bs_contents()
        # One anchor per city/area listing on the front page.
        all_city_tags = bs_content.find_all(
            name="a",
            attrs={"href": re.compile(r"http://shop\.99114\.com/list/area")})
        for city_tag in all_city_tags:
            city_url = city_tag.get("href")
            html_analyse = HtmlAnalyse(city_url)
            bs_content = html_analyse.get_bs_contents()
            # Second-to-last pager link holds the total page count.
            page_tag = bs_content.find_all(
                name="a", attrs={"href": re.compile(r"/list/area/")})[-2]
            page_count = int(page_tag.text.replace(",", ""))
            page_urls = map(
                lambda page_num: city_url[:-1] + str(page_num) + ".html",
                range(1, page_count + 1))

            # for page_url in page_urls:
            #     thread_go(page_url)

            threading_pool = ThreadingPool(12)
            threading_pool.multi_process(thread_go, page_urls)
예제 #3
0
    def get_categories(self):
        """Scrape the chip1stop class-search page and return a list of
        (first_directory_name, second_directory_name, second_directory_url)
        tuples, rotating proxies until the page is fetched.
        """
        main_url = "http://www.chip1stop.com/web/CHN/zh/dispClassSearchTop.do"
        self.proxy_ip = self.proxy_pool.get()
        # Keep trying with a fresh proxy until the fetch succeeds.
        while True:
            try:
                analyser = HtmlAnalyse(main_url, proxy=self.proxy_ip)
                bs_content = analyser.get_bs_contents()
            except Exception as err:
                print(sys._getframe().f_code.co_name, err)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
            else:
                break

        categories = []
        for dl_tag in bs_content.find_all(
                name="dl", attrs={"class": "categoryListDl clearfix"}):
            top_name = dl_tag.dt.text
            for dd_tag in dl_tag.find_all(name="dd"):
                raw_name = dd_tag.text
                # Keep only the text before the bracketed count, e.g. "Name [12]".
                sub_name = re.match(r"(.*?)\[", raw_name).group(1).strip()
                sub_url = ("http://www.chip1stop.com/web/CHN/zh" +
                           dd_tag.span.a.get("href")[1:])
                categories.append((top_name, sub_name, sub_url))
        return categories
예제 #4
0
 def thread_go(page_url):
     """Fetch one 99114 list page and store each linked company name.

     NOTE(review): ``city_url`` and ``self`` are free names here — this
     function looks like a closure lifted out of a method; they must be
     supplied by the enclosing scope. Verify before reuse.
     """
     html_analyse = HtmlAnalyse(page_url)
     # Retry the fetch forever; failures are only logged.
     while True:
         try:
             bs_content = html_analyse.get_bs_contents()
             break
         except Exception as e:
             print(e)
     # Company links open in a new tab and carry a numeric path.
     company_tags = bs_content.find_all(name="a",
                                        attrs={
                                            "target": "_blank",
                                            "href": re.compile(r"/\d+")
                                        })
     corporations = []
     for company_tag in company_tags:
         corporation = company_tag.text.strip()
         corporation_dict = {
             "corporation": corporation,
             "province_url": city_url,
             "page_url": page_url,
             "状态": "未完成",
             "from": "99114"
         }
         corporations.append(corporation)
         col = self.db.All_Company_Name
         col.insert(corporation_dict)
     print(corporations)
     return corporations
예제 #5
0
    def get_series_urls(self, list_url):
        """Return every series URL (with '&limit=100' appended) found across
        all result pages of a Panasonic list page.
        """

        def collect_page_urls(url):
            # pager-item count + 1 equals the total number of pages.
            contents = HtmlAnalyse(url).get_bs_contents()
            total = len(contents.find_all(name="li",
                                          attrs={"class": "pager-item"})) + 1
            base = url.split('#')[0]
            return [base + "?page=" + str(i) + Third_Suffix_Url
                    for i in range(total)]

        page_urls = collect_page_urls(list_url)
        if page_urls is None:
            return None
        series_urls = []
        for page_url in page_urls:
            contents = HtmlAnalyse(page_url).get_bs_contents()
            rows = contents.find_all(
                name='tr',
                attrs={"class": re.compile(u"ecatalog-series-table")})
            for row in rows:
                # The link usually sits in the first cell; fall back to the
                # second cell when the first has no anchor.
                try:
                    href = row.find_all(name="td")[0].a.get("href")
                except Exception:
                    href = row.find_all(name="td")[1].a.get("href")
                series_urls.append(Panasonic_Pre_Url + href + '&limit=100')

        return series_urls
예제 #6
0
 def thread_go(page_url):
     """Fetch one soudh.com list page and store each <li> company name.

     NOTE(review): ``province_url`` and ``self`` are free names — presumably
     bound by an enclosing method; verify before reuse.
     """
     html_analyse = HtmlAnalyse(page_url)
     # Retry the fetch forever; failures are only logged.
     while True:
         try:
             bs_content = html_analyse.get_bs_contents()
             break
         except Exception as e:
             print(e)
     # Despite the name, this is the <div class="leftbox comlist"> container.
     ul_tag = bs_content.find(name="div",
                              attrs={"class": "leftbox comlist"})
     li_tags = ul_tag.find_all(name="li")
     corporations = []
     for li_tag in li_tags:
         corporation = li_tag.text.strip()
         corporation_dict = {
             "corporation": corporation,
             "province_url": province_url,
             "page_url": page_url,
             "状态": "未完成"
         }
         corporations.append(corporation)
         col = self.db.All_Company_Name
         col.insert(corporation_dict)
     print(corporations)
     return corporations
예제 #7
0
    def get_supplier(self, url):
        """Scrape one supplier detail page and return a 7-tuple:
        (name, place, contact, fax, phone, mobile, address).

        Field values follow a full-width colon (":") inside labelled <li>
        elements.
        """
        bs_content = HtmlAnalyse(url).get_bs_contents()
        ul_tag = bs_content.find(name="ul",
                                 attrs={"class": "txl_content_con_L"})

        def field(label_pattern):
            # Find the <li> whose text matches, then take the value after
            # the first full-width colon.
            li = ul_tag.find(name="li", text=re.compile(label_pattern))
            return li.text.split(":", 2)[1].strip()

        supplier_name = ul_tag.h1.text.strip()
        # First <li> is the location; flatten embedded newlines.
        supplier_place = ul_tag.li.text.split(":", 2)[1].replace(
            "\n", " ").strip()
        # r'\t' in the phone pattern matches the literal tab in the page text.
        line = (supplier_name,
                supplier_place,
                field(r'联系人:'),
                field(r'传真:'),
                field(r'公司\t联系电话:'),
                field(r'手机号码:'),
                field(r'联系地址:'))
        print(line)
        return line
예제 #8
0
    def get_products_list(self, series_url):
        """Collect all product <tr> rows (class "odd"/"even", header row
        skipped) across every result page of a series.

        Returns a list of bs4 Tag rows; empty list when nothing matched.
        """

        def get_pages_urls(url):
            # The "last page" anchor ("末页 »") encodes the final page index
            # in its href; absence (or an unmatched href) means one page.
            html_analyse = HtmlAnalyse(url, is_proxy=True)
            bs_content = html_analyse.get_bs_contents()
            page_tag = bs_content.find(name="a",
                                       attrs={"title": "到最后一页"},
                                       text="末页 »")
            try:
                rough_page = page_tag.get("href")
                page = re.match(r"/ea/products/.*?page=(\d+)&reset=1",
                                rough_page).group(1)
            except AttributeError:
                # page_tag is None, or re.match returned None. The original
                # bare ``except:`` also swallowed KeyboardInterrupt/SystemExit.
                page = 0
            return [url + "&page=" + str(i) for i in range(int(page) + 1)]

        product_lists = []
        # get_pages_urls always returns a list, so no None check is needed.
        for page_url in get_pages_urls(series_url):
            html_analyse = HtmlAnalyse(page_url)
            bs_contents = html_analyse.get_bs_contents()
            # [1:] drops the header row.
            product_list = bs_contents.find_all(
                name='tr', attrs={"class":
                                  re.compile(u"(^odd$)|(^even$)")})[1:]
            if product_list:
                product_lists += product_list
        return product_lists
예제 #9
0
 def get_attach(detail_url):
     """Return the absolute datasheet URL from a detail page, or "" when the
     #displayedPath anchor (or its href) is missing.
     """
     html_analyse = HtmlAnalyse(detail_url)
     bs_content = html_analyse.get_bs_contents()
     rough_attach = bs_content.find(name="a", id="displayedPath")
     try:
         attach = ST_Pre_Url + rough_attach.get("href")
     except (AttributeError, TypeError):
         # AttributeError: anchor missing (find returned None);
         # TypeError: anchor present but has no href (str + None).
         # The original bare ``except:`` also swallowed KeyboardInterrupt.
         attach = ""
     return attach
예제 #10
0
 def get_product_list(self):
     """Return absolute product URLs, one per <tr class="products"> row."""
     soup = HtmlAnalyse(self.url).get_bs_contents()
     rows = soup.find_all(name="tr", attrs={"class": "products"})
     # First cell of each row holds the product anchor.
     return [Infineon_Pre_Url + row.td.a.get("href") for row in rows]
예제 #11
0
 def get_pages_urls(url):
     """Expand a list URL into one URL per page.

     The number of <li class="pager-item"> elements plus one gives the
     total page count; the fragment (after '#') is dropped from the base.
     """
     soup = HtmlAnalyse(url).get_bs_contents()
     total = len(soup.find_all(name="li", attrs={"class": "pager-item"})) + 1
     base = url.split('#')[0]
     return [base + "?page=" + str(n) + Third_Suffix_Url
             for n in range(total)]
예제 #12
0
 def get_img(url):
     """Return the product image ``src``, or '' when the image tag is missing."""
     html_analyse = HtmlAnalyse(url)
     bs_content = html_analyse.get_bs_contents()
     rough_img = bs_content.find(name="img", id="productImageId")
     try:
         img = rough_img.get("src")
     except AttributeError:
         # rough_img is None: the page has no product image. The original
         # bare ``except:`` also swallowed KeyboardInterrupt/SystemExit.
         print("未获取图片", url)
         img = ''
     return img
예제 #13
0
 def get_supplier_urls(self, url):
     """Collect absolute supplier-book URLs from every
     <ul class="sheng_weizhi_lb"> block on the page.
     """
     html_analyse = HtmlAnalyse(url)
     bs_content = html_analyse.get_bs_contents()
     ul_tags = bs_content.find_all(name="ul",
                                   attrs={"class": "sheng_weizhi_lb"})
     urls = []
     for ul_tag in ul_tags:
         # Renamed from ``url``: the original loop variable shadowed the
         # ``url`` parameter.
         supplier_url = "http://book.youboy.com" + ul_tag.div.strong.a.get(
             "href")
         urls.append(supplier_url)
     return urls
예제 #14
0
    def get_code_urls(self, series_url):
        """Collect product detail URLs for every page of a series, skipping
        codes already present in ``product$component_crawl``.

        Returns a list of absolute product URLs.
        """

        def get_pages_urls(url):
            # The "last page" anchor ("末页 »") encodes the final page index
            # in its href; absence (or an unmatched href) means one page.
            html_analyse = HtmlAnalyse(url, is_proxy=True)
            bs_content = html_analyse.get_bs_contents()
            page_tag = bs_content.find(name="a",
                                       attrs={"title": "到最后一页"},
                                       text="末页 »")
            try:
                rough_page = page_tag.get("href")
                page = re.match(r"/ea/products/.*?page=(\d+)&reset=1",
                                rough_page).group(1)
            except AttributeError:
                # page_tag is None, or re.match returned None. The original
                # bare ``except:`` also swallowed KeyboardInterrupt.
                page = 0
            return [url + "&page=" + str(i) for i in range(int(page) + 1)]

        product_urls = []
        # get_pages_urls always returns a list, so no None check is needed.
        for page_url in get_pages_urls(series_url):
            html_analyse = HtmlAnalyse(page_url)
            bs_contents = html_analyse.get_bs_contents()
            rows = bs_contents.find_all(
                name='tr', attrs={"class": re.compile(u"(^odd$)|(^even$)")})
            # [1:] skips the header row; ``row`` no longer shadows the
            # ``list`` builtin as the original loop variable did.
            for row in rows[1:]:
                try:
                    model = row.td.a
                    code = model.text
                except AttributeError:
                    # No anchor in the first cell: end of the data rows.
                    break

                # *******去重 (skip codes already crawled)*******
                # NOTE(review): the code is interpolated into SQL directly;
                # fine for crawler-internal data but not injection-safe.
                orcl_con = OracleConnection()
                cursor = orcl_con.conn.cursor()
                try:
                    cursor.execute(
                        "select cc_id from product$component_crawl where cc_code='{}'"
                        .format(code))
                    data = cursor.fetchone()
                finally:
                    # BUG FIX: the original ``continue`` on a repeat skipped
                    # cursor.close()/conn.close(), leaking a connection per
                    # duplicate row.
                    cursor.close()
                    orcl_con.conn.close()
                if data:
                    print("repeat")
                    continue
                # *******结束*******

                href = model.get("href")
                product_urls.append(Panasonic_Pre_Url + href)
        return product_urls
예제 #15
0
 def get_pdf(url):
     """Return the catalog pdf href with "&via=ok" appended, or '' when the
     page has no matching download link.
     """
     soup = HtmlAnalyse(url).get_bs_contents()
     anchor = soup.find(
         name="a",
         attrs={"href": re.compile(r'/ac/c_download/.*?\.pdf')})
     return anchor.get("href") + "&via=ok" if anchor else ''
예제 #16
0
 def get_product_url(self):
     """Pair the page's section image with each device link.

     Returns a list of (image_url, product_url) tuples — the same image is
     reused for every product on the page.
     """
     soup = HtmlAnalyse(self.url).get_bs_contents()
     sections = soup.find_all(name="div", attrs={"class": "section-devices"})
     img_tag = soup.find(name="img",
                         attrs={"src": re.compile(r'/Images/.*?\.jpg')})
     img = Atmel_Pre_Url + img_tag.get("src")
     return [(img, Atmel_Pre_Url + section.a.get("href"))
             for section in sections]
예제 #17
0
def belling(url):
    """Print and return every href ending in ``.pdf`` found on the page."""
    soup = HtmlAnalyse(url).get_bs_contents()
    hrefs = []
    for anchor in soup.find_all(name="a",
                                attrs={"href": re.compile(r".*?\.pdf$")}):
        link = anchor.get("href")
        print(link)
        hrefs.append(link)
    return hrefs
예제 #18
0
 def get_pdf(url):
     """Return the photomos catalog href with "&via=ok" appended, or ''
     when the download link is absent.

     Previously a missing anchor raised AttributeError on ``pdf.get``;
     now degrades to '' like the sibling ``get_pdf`` helper.
     """
     html_analyse = HtmlAnalyse(url)
     bs_content = html_analyse.get_bs_contents()
     pdf = bs_content.find(
         name="a",
         attrs={
             "href":
             re.compile(
                 r'/ac/c_download/control/relay/photomos/catalog/semi_cn_'
             )
         })
     if pdf is None:
         return ''
     return pdf.get("href") + "&via=ok"
예제 #19
0
 def get_pages_urls(url):
     """Expand a series URL into one URL per result page.

     The "末页 »" anchor's href carries the last page index; when it is
     missing or unparsable, assume a single page (index 0).
     """
     html_analyse = HtmlAnalyse(url, is_proxy=True)
     bs_content = html_analyse.get_bs_contents()
     page_tag = bs_content.find(name="a", attrs={"title": "到最后一页"}, text="末页 »")
     try:
         rough_page = page_tag.get("href")
         page = re.match(r"/ea/products/.*?page=(\d+)&reset=1", rough_page).group(1)
     except AttributeError:
         # page_tag is None, or re.match returned None. The original bare
         # ``except:`` also swallowed KeyboardInterrupt/SystemExit.
         page = 0
     page_urls = []
     for i in range(int(page) + 1):
         page_url = url + "&page=" + str(i)
         page_urls.append(page_url)
     return page_urls
예제 #20
0
 def get_all_content(self):
     """Fetch and parse every URL in ``self.series_urls``; return the soups.

     The human-sensor "wl/number" page is special-cased: it needs a
     requests session with browser-like headers, and its second result
     page is fetched with a form POST in addition to the initial GET, so
     that URL contributes two soups to the result.
     """
     many_contents = []
     for series_url in self.series_urls:
         if series_url == "http://device.panasonic.cn/ac/c/control/sensor/human/wl/number/index.jsp?c=search":
             session = requests.session()
             # Browser-like headers; the endpoint rejects bare requests.
             session.headers.update({
                 'Connection': 'keep-alive',
                 'Accept':
                 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Accept-Language': 'zh-CN,zh;q=0.8',
                 'Accept-Encoding': 'gzip, deflate',
                 'Cache-Control': 'max-age=0',
                 'Content-Type': 'application/x-www-form-urlencoded',
                 'User-Agent':
                 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                 'X-Requested-With': 'XMLHttpRequest',
                 'Host': 'device.panasonic.cn',
                 'Origin': 'http://device.panasonic.cn',
                 'Referer':
                 'http://device.panasonic.cn/ac/c/control/sensor/human/wl/number/index.jsp?c=search',
                 'Upgrade-Insecure-Requests': '1'
             })
             # Paging form: request page 2 at 20 rows per page; the VAL_*
             # filter fields are left empty.
             form = {
                 'pagecnt': 1,
                 'maxrows': 20,
                 'topage': 2,
                 'VAL_3_3286': '',
                 'VAL_3_3433': '',
                 'VAL_3_3287': '',
                 'VAL_3_3436': '',
                 'part_no': ''
             }
             # Page 1 via GET...
             content0 = session.get(series_url).text
             bs_contents0 = BeautifulSoup(content0, "html.parser")
             many_contents.append(bs_contents0)
             # ...page 2 via the "c=move" POST endpoint.
             content1 = session.post(
                 "http://device.panasonic.cn/ac/c/control/sensor/human/wl/number/index.jsp?c=move",
                 data=form).text
             bs_contents1 = BeautifulSoup(content1, "html.parser")
             many_contents.append(bs_contents1)
         else:
             html_analyse = HtmlAnalyse(series_url)
             bs_contents = html_analyse.get_bs_contents()
             many_contents.append(bs_contents)
     return many_contents
예제 #21
0
 def thread_go(self, parameters):
     """Parse one detail page's property table into a component tuple.

     NOTE(review): this looks unfinished — ``cc_unit``, ``cc_kiname``,
     ``cc_url`` and ``cc_img`` are never assigned in this body, and
     ``cc_code`` / ``cc_brandname`` / ``cc_attach`` stay unbound when
     their table rows are absent (NameError at the tuple build).
     ``component`` is built but never used or returned. Confirm against
     the full class before relying on this method.
     """
     url = parameters
     html_analyse = HtmlAnalyse(url)
     bs_content = html_analyse.get_bs_contents()
     tr_tags = bs_content.find(name="tbody").find_all(name="tr")
     for tr_tag in tr_tags:
         td_tags = tr_tag.find_all("td")
         # Column 0 is the property label, column 1 the value cell.
         property_name = td_tags[0].text.strip()
         value_tag = td_tags[1]
         if property_name == "料号":
             cc_code = value_tag.text.strip()
         elif property_name == "品牌":
             cc_brandname = value_tag.text.strip()
         elif property_name == "规格书":
             cc_attach = value_tag.get("href")
         property_value = td_tags[1]  # NOTE(review): assigned but unused
     component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url,
                  cc_attach, cc_img)
예제 #22
0
    def get_suppliers(self):
        """Crawl soudh.com province by province (ids 1–35), storing every
        company name from every list page into ``All_Company_Name``.
        """

        def thread_go(page_url):
            # Worker: one list page → insert each <li> company into Mongo.
            # NOTE(review): closes over ``province_url`` from the loop below;
            # safe only if multi_thread finishes before the next province.
            html_analyse = HtmlAnalyse(page_url)
            # Retry the fetch forever; failures are only logged.
            while True:
                try:
                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(e)
            # Despite the name, this is the <div class="leftbox comlist">.
            ul_tag = bs_content.find(name="div",
                                     attrs={"class": "leftbox comlist"})
            li_tags = ul_tag.find_all(name="li")
            corporations = []
            for li_tag in li_tags:
                corporation = li_tag.text.strip()
                corporation_dict = {
                    "corporation": corporation,
                    "province_url": province_url,
                    "page_url": page_url,
                    "状态": "未完成"
                }
                corporations.append(corporation)
                col = self.db.All_Company_Name
                col.insert(corporation_dict)
            print(corporations)
            return corporations

        for province_id in range(1, 36):
            province_url = "http://www.soudh.com/province-" + str(
                province_id) + ".html"
            html_analyse = HtmlAnalyse(province_url)
            bs_content = html_analyse.get_bs_contents()
            # The "当前为…共N页" span carries the total page count.
            page_tag = bs_content.find(name="span", text=re.compile(r'当前为'))
            page_count = int(re.match(r'.*?共(\d+)页', page_tag.text).group(1))
            page_urls = map(
                lambda page_num: province_url[:-5] + "-" + str(page_num) +
                ".html", range(1, page_count + 1))
            #
            # for page_url in page_urls:
            #     thread_go(page_url)

            threading_pool = ThreadingPool()
            threading_pool.multi_thread(thread_go, page_urls)
예제 #23
0
    def get_first_classes(
            self, url="http://www.mlcc1.com/search_simplex.html?searchkey="):
        """Pair each top-level class name with its <ul> content block.

        Returns a list of (first_class_name, first_class_content) tuples,
        where the name is the <p class="down"> text up to its first space.
        """
        html_analyse = HtmlAnalyse(url)
        bs_content = html_analyse.get_bs_contents()

        first_tag_names = bs_content.find_all(name="p",
                                              attrs={"class": "down"})
        first_class_contents = bs_content.find_all(
            name="ul", attrs={"class": re.compile(r'mlcc_\d+_list')})

        first_classs = []
        for first_tag_name, first_class_content in zip(first_tag_names,
                                                       first_class_contents):
            # BUG FIX: the original pattern r'(.*?) (.*?' had an unterminated
            # group and raised re.error at runtime; the intent (text before
            # the first space) is preserved with a closed second group.
            first_class_name = re.match(r'(.*?) (.*)',
                                        first_tag_name.text).group(1)
            # first_class_name = first_tag_name.text.replace(' ', '')
            first_class = (first_class_name, first_class_content)
            first_classs.append(first_class)

        return first_classs
예제 #24
0
 def __init__(self, product_url):
     """Remember the product URL and cache its parsed page content."""
     self.url = product_url
     self.bs_content = HtmlAnalyse(self.url).get_bs_contents()
예제 #25
0
    def get_page_url(self, second_category):
        """Expand one second-level RS category into per-page component URLs
        and feed each page's components to ``self.thread_go`` via a pool.

        second_category: (first_name, second_name, second_url) tuple.
        Fetch failures draw a new proxy and retry indefinitely (the failed
        proxy is not removed from the pool here, unlike elsewhere).
        """
        first_category_name, second_category_name, second_category_url = second_category
        # Fetch the second-level category page.
        while True:
            try:
                html_analyse = HtmlAnalyse(second_category_url,
                                           proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()

        ul_tag = bs_content.find(name="ul", attrs={"class": "brcategories"})
        third_category_tags = ul_tag.find_all(
            name="div", attrs={"class": "rsGARealEstate"})
        for third_category_tag in third_category_tags:
            third_category_name = third_category_tag.a.text
            third_category_url = Rs_Pre_Url + third_category_tag.a.get("href")

            # Fetch the third-level category page with the same retry policy.
            while True:
                try:
                    html_analyse = HtmlAnalyse(third_category_url,
                                               proxy=self.proxy_ip)

                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
            # The product-count banner may be absent; skip such categories.
            try:
                page_tag = bs_content.find(name="div",
                                           attrs={
                                               "class": "viewProdDiv"
                                           }).text
            except Exception as e:
                print(third_category_url, e, "找不到page_tag")
                continue
            # "共N个" gives the product total; pages hold 20 products each.
            flag = re.match(r".*?共(.*?)个", page_tag)
            page_count = int(int(flag.group(1).strip()) / 20 + 1)
            for page_num in range(int(page_count)):
                page_url = third_category_url + "?pn=" + str(page_num + 1)
                while True:
                    try:

                        html_analyse = HtmlAnalyse(page_url,
                                                   proxy=self.proxy_ip)
                        bs_content = html_analyse.get_bs_contents()
                        break
                    except Exception as e:
                        print(sys._getframe().f_code.co_name, e)
                        self.proxy_ip = self.proxy_pool.get()
                component_url_tags = bs_content.find_all(
                    name="a", attrs={"class": "tnProdDesc"})
                page_attributes = []
                for component_url_tag in component_url_tags:
                    component_url = Rs_Pre_Url + component_url_tag.get("href")
                    union_category_name = second_category_name + "---" + third_category_name
                    page_attribute = (first_category_name, union_category_name,
                                      component_url)
                    page_attributes.append(page_attribute)
                #
                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(self.thread_go, page_attributes)

                # for page_attribute in page_attributes:
                #     self.thread_go(page_attribute)

            continue
예제 #26
0
    def thread_go(self, page_attributes):
        """Scrape one RS component detail page and persist it to Oracle.

        page_attributes: (cc_unit, cc_kiname, cc_url) — unit/category names
        plus the component detail URL. Pages lacking both a brand and a part
        number are skipped. Returns None.
        """
        cc_unit, cc_kiname, cc_url = page_attributes
        html_analyse = HtmlAnalyse(cc_url)
        # Retry the fetch indefinitely; transient failures are only logged.
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)

        brand_tag = bs_content.find(name="span", attrs={"itemprop": "brand"})
        name_tag = bs_content.find(name="span", attrs={"itemprop": "mpn"})

        # Not a product page without both brand and part number.
        if not brand_tag or not name_tag:
            return
        cc_brandname = brand_tag.text.strip()

        cc_code = name_tag.text.strip()

        img_tag = bs_content.find(name="img", attrs={"itemprop": "image"})
        if not img_tag:
            cc_img = ""
        else:
            cc_img = Rs_Pre_Url + img_tag.get("src")

        # Datasheet link is hidden in a window.open(...) onclick handler.
        attach_tag = bs_content.find(
            name="a",
            attrs={"onclick": re.compile(r"window\.open\('http://docs")})
        if not attach_tag:
            cc_attach = ""
        else:
            attach_name = attach_tag.get("onclick")
            try:
                cc_attach = re.match(r"window\.open\('(.*?\.pdf)'\)",
                                     attach_name).group(1)
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                cc_attach = ""

        component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url,
                     cc_attach, cc_img)

        # 器件属性: insert the component and its parameter rows, retrying the
        # whole transaction until it commits.
        while True:
            # BUG FIX: bind before the try so the finally below cannot hit an
            # unbound name (NameError) when OracleSave() itself raises.
            orcl_conn = None
            try:
                orcl_conn = OracleSave(1000005)
                orcl_conn.component_insert(component)
                component_properties = []
                tr_tags = bs_content.find_all(
                    name="tr", attrs={"class": re.compile(r"dr-table-row")})
                for tr_tag in tr_tags:
                    td_tags = tr_tag.find_all(name="td")
                    parameter_name = td_tags[1].text
                    parameter_value = td_tags[2].text
                    component_property = (parameter_name, parameter_value)
                    component_properties.append(component_property)

                    orcl_conn.properties_insert(component_property)
                orcl_conn.commit()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            finally:
                if orcl_conn is not None:
                    orcl_conn.conn.close()
예제 #27
0
"""
    @description:   
    @author:        RoyalClown
    @date:          2017/3/20
"""
from Lib.NetCrawl.HtmlAnalyse import HtmlAnalyse

html_analyse = HtmlAnalyse("http://china.rs-online.com/web/p/igbt-transistors/7965064/")
bs_content = html_analyse.get_bs_contents()
print(bs_content)
예제 #28
0
 def __init__(self, img_url):
     """img_url: (image_src, page_url) pair; fetch and cache the parsed page."""
     self.img, self.url = img_url
     self.bs_content = HtmlAnalyse(self.url).get_bs_contents()
예제 #29
0
 def __init__(self, url="http://www.azurewave.com/product_a001_1.asp"):
     self.url = url
     html_analyse = HtmlAnalyse(self.url)
     self.bs_content = html_analyse.get_bs_contents()
예제 #30
0
 def __init__(self, url, code):
     """Store the target url/code pair and cache the parsed page content."""
     self.url = url
     self.code = code
     self.bs_content = HtmlAnalyse(self.url).get_bs_contents()