Ejemplo n.º 1
0
    def thread_go(self, page_url):
        """Store every component found on ``page_url`` together with its properties.

        ``self.detail.get_page_components(page_url)`` is expected to yield
        ``(component, single_properties)`` pairs. Each pair is written through
        its own ``OracleSave(1111111)`` connection: the component first, then
        every entry of ``single_properties``, followed by a commit.

        A failed write is printed and retried with a fresh connection until it
        succeeds.
        NOTE(review): there is no retry limit or back-off, so a permanently
        failing write loops forever.

        Args:
            page_url: URL handed to ``self.detail.get_page_components``.

        Returns:
            None.
        """
        many_components_properties = self.detail.get_page_components(page_url)
        for component, single_properties in many_components_properties:
            while True:
                # Reset per attempt so ``finally`` never touches a connection
                # that belongs to a previous (already closed) attempt.
                orcl_conn = None
                try:
                    orcl_conn = OracleSave(1111111)
                    orcl_conn.component_insert(component)
                    for properties in single_properties:
                        orcl_conn.properties_insert(properties)
                    orcl_conn.commit()
                    break
                except Exception as e:
                    print(e, "存储错误")
                finally:
                    # Always release the connection, including on failure;
                    # previously a failed attempt left it open on every retry.
                    if orcl_conn is not None:
                        try:
                            orcl_conn.conn.close()
                        except Exception as e:
                            # A close failure after a successful commit must
                            # not trigger a second insert of the same data.
                            print(e, "存储错误")
        return
Ejemplo n.º 2
0
    def page_thread_go(self, page_category):
        """Scrape one search-result page and store every component row it lists.

        The page is requested through ``self.my_session`` using the proxy in
        ``self.proxy_ip``. A non-200 response or any exception switches to
        another proxy from ``self.proxy_pool`` and retries without limit.
        Each parsed row is then written through a fresh
        ``OracleSave(1000002)`` connection, retrying until the write succeeds.

        Args:
            page_category: A ``(first_category_name, second_category_name,
                page_url)`` triple. The two names are stored with every
                component; ``page_url`` is the page that is fetched.

        Returns:
            None.
        """
        first_category_name, second_category_name, page_url = page_category
        count = 0
        my_headers = {'Connection': 'Keep-Alive',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
                      "Host": "www.mouser.cn", "Upgrade-Insecure-Requests": "1", }
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(page_url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    # The proxy produced a bad status: drop it and try another.
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                # ``find`` returns None when the table is missing; the
                # resulting AttributeError is handled below as a failed fetch.
                component_tags = bs_content.find(name="table", attrs={"class": "SearchResultsTable"}).find_all(
                    name="tr", attrs={"class": re.compile(r"SearchResult")})
                break
            except Exception as e:
                count += 1
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
                # ``count`` is never reset here, so once it exceeds 20 every
                # further failure refreshes the pool again.
                if count > 20:
                    self.proxy_pool._refresh()

        # A results table without any matching row used to raise IndexError on
        # ``component_tags[0]``; there is nothing to store in that case.
        if not component_tags:
            return

        # Header cells from index 11 onward name the per-component properties.
        table_header_tags = component_tags[0].find_all(name="th")[11:]
        len_heads = len(table_header_tags)

        # Rows 0 and 1 are skipped -- presumably header rows; confirm against
        # the page markup.
        for component_tag in component_tags[2:]:
            td_tags = component_tag.find_all(name="td")
            try:
                # Only the text before the first newline is the part number.
                component_code = td_tags[3].text.strip().partition("\n")[0]
            except Exception as e:
                print("component code is None", e)
                continue
            try:
                component_img = self.mouser_host_url + td_tags[1].find(name="img").get("src").replace("/sm/",
                                                                                                      "/images/")
            except Exception:
                # The image is optional; store an empty string when absent.
                component_img = ""
            try:
                rough_attach = td_tags[6].find(name="a", text=re.compile(r".*数据表"))
                component_attach = rough_attach.get("href")
                # Only links containing "http" are kept.
                if "http" not in component_attach:
                    component_attach = ""
            except Exception:
                print("pdf is none", page_url, component_code)
                component_attach = ""
            try:
                component_brand = td_tags[4].a.text
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                continue

            component = (
                component_code, component_brand, first_category_name, second_category_name, page_url,
                component_attach,
                component_img)
            try:
                rohs_tag = td_tags[10]
            except IndexError as e:
                print(e)
                continue

            property_key_values = []
            if rohs_tag.text == "详细信息":
                property_key_values.append(("RoHS", "Yes"))

            if len_heads:
                # Property values are the trailing cells of the row, aligned
                # with the property header cells collected above.
                for name_tag, property_tag in zip(table_header_tags, td_tags[-len_heads:]):
                    property_key_values.append(
                        (name_tag.text.strip(), property_tag.text.strip()))

            # NOTE(review): the write is retried without limit or back-off.
            while True:
                # Reset per attempt so ``finally`` only closes a connection
                # opened by this attempt.
                orcl_conn = None
                try:
                    orcl_conn = OracleSave(1000002)
                    orcl_conn.component_insert(component)
                    for key_value in property_key_values:
                        orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                    break
                except Exception as e:
                    print("database save exception", e)
                finally:
                    # Release the connection on failure too; previously every
                    # failed attempt leaked one.
                    if orcl_conn is not None:
                        try:
                            orcl_conn.conn.close()
                        except Exception as e:
                            # Do not re-insert after a successful commit just
                            # because closing failed.
                            print("database save exception", e)
Ejemplo n.º 3
0
        def thread_go(page_no):
            """Fetch result page ``page_no`` part by part and store each row.

            For each ``nextSearchIndex`` offset in ``range(0, 25, 5)`` the
            enclosing scope's ``complete_form_data`` is updated and POSTed to
            the category URL until a 200 response containing data rows is
            received. Every row is inserted through one ``OracleSave(1000001)``
            connection per part, which is committed once all rows are written.

            NOTE(review): ``complete_form_data`` is shared with the enclosing
            scope and mutated in place; confirm that concurrent callers cannot
            interleave their updates.

            Args:
                page_no: Page number sent as ``dispPageNo`` and appended to
                    the request URL.

            Returns:
                None.
            """
            print("Page:", page_no)

            for page_part in range(0, 25, 5):
                print("Part:", page_part)
                complete_form_data['nextSearchIndex'] = page_part
                complete_form_data['dispPageNo'] = page_no
                complete_form_data['type'] = "page"
                detail_url = second_category_url + "&dispPageNo=%d" % page_no

                # NOTE(review): a 200 response without data rows is retried
                # with the same proxy and form data, without limit.
                while True:
                    try:
                        my_session.cookies.update(request_cookie)
                        res = my_session.post(detail_url,
                                              data=complete_form_data,
                                              proxies=self.proxy_ip,
                                              timeout=20)
                        print(res.status_code)
                        if res.status_code == 200:
                            content = res.content.decode()
                            bs_content = BeautifulSoup(content, "lxml")
                            # The first <tr> is skipped -- presumably the
                            # header row; confirm against the page markup.
                            tr_tags = bs_content.find_all(name="tr")[1:]
                            if tr_tags:
                                break
                        else:
                            self.proxy_pool.remove(self.proxy_ip)
                            self.proxy_ip = self.proxy_pool.get()
                    except Exception as e:
                        print(sys._getframe().f_code.co_name, e)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()

                # The loop above only exits with a non-empty ``tr_tags``, so
                # no further emptiness check is needed here.

                # Database connection
                orcl_conn = OracleSave(1000001)
                try:
                    for tr_tag in tr_tags:
                        try:
                            code = tr_tag.td.find(name="p",
                                                  attrs={
                                                      "class": "text14pt2 bold"
                                                  }).text.strip()
                        except Exception as e:
                            # Rows without a part-number cell are skipped.
                            print(e)
                            continue

                        chip1stop_code = tr_tag.td.find(name="p",
                                                        attrs={
                                                            "class": "text10"
                                                        }).text.strip()
                        print(chip1stop_code)
                        maker = tr_tag.td.find(name="p",
                                               attrs={
                                                   "class": "text10 wordBreak"
                                               }).text.strip()
                        pdf_url = tr_tag.find(
                            name="a",
                            attrs={
                                "href":
                                re.compile(
                                    r"http://download\.siliconexpert\.com/pdfs")
                            })
                        # ``pdf_url`` stays None when the row has no datasheet
                        # link.
                        if pdf_url:
                            pdf_url = pdf_url.get("href")

                        component = (code, maker, first_category_name,
                                     second_category_name, second_category_url,
                                     pdf_url, None)
                        orcl_conn.component_insert(component)

                        # Cells from index 6 up to (excluding) the last one
                        # are paired with ``property_names`` from the
                        # enclosing scope.
                        property_tags = tr_tag.find_all(name="td")[6:-1]
                        for property_name, property_tag in zip(
                                property_names, property_tags):
                            if property_name == '购买/询价':
                                continue
                            property_value = property_tag.text
                            if property_value:
                                property_value = property_value.strip()
                            single_property = (property_name, property_value)
                            orcl_conn.properties_insert(single_property)

                    orcl_conn.commit()
                finally:
                    # Close the connection even when a row raises; previously
                    # an unguarded parse or insert error left it open.
                    orcl_conn.conn.close()
Ejemplo n.º 4
0
    def thread_go(self, page_attributes):
        """Scrape one product detail page and store the component it describes.

        The page is loaded through ``HtmlAnalyse(cc_url).get_bs_contents()``,
        retrying without limit on any exception. Brand and part number are
        mandatory: if either tag is missing the method returns without storing
        anything. Image and datasheet links are optional and default to an
        empty string. The component and its parameter rows are then written
        through ``OracleSave(1000005)``, retrying until the write succeeds.

        Args:
            page_attributes: A ``(cc_unit, cc_kiname, cc_url)`` triple. The
                first two values are stored with the component; ``cc_url`` is
                the page to scrape.

        Returns:
            None.
        """
        cc_unit, cc_kiname, cc_url = page_attributes
        html_analyse = HtmlAnalyse(cc_url)
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)

        brand_tag = bs_content.find(name="span", attrs={"itemprop": "brand"})
        name_tag = bs_content.find(name="span", attrs={"itemprop": "mpn"})

        if not brand_tag or not name_tag:
            return
        cc_brandname = brand_tag.text.strip()
        cc_code = name_tag.text.strip()

        img_tag = bs_content.find(name="img", attrs={"itemprop": "image"})
        cc_img = Rs_Pre_Url + img_tag.get("src") if img_tag else ""

        attach_tag = bs_content.find(
            name="a",
            attrs={"onclick": re.compile(r"window\.open\('http://docs")})
        if not attach_tag:
            cc_attach = ""
        else:
            attach_name = attach_tag.get("onclick")
            try:
                # ``re.match`` returns None when the handler does not open a
                # ``.pdf`` URL; the resulting AttributeError is reported and
                # treated as "no datasheet".
                cc_attach = re.match(r"window\.open\('(.*?\.pdf)'\)",
                                     attach_name).group(1)
            except (AttributeError, TypeError) as e:
                print(sys._getframe().f_code.co_name, e)
                cc_attach = ""

        component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url,
                     cc_attach, cc_img)

        # Component properties.
        # They are parsed once, before the retry loop: a row with too few
        # cells is a deterministic parse error, and inside the loop it made
        # every attempt fail, so the loop never ended.
        component_properties = []
        tr_tags = bs_content.find_all(
            name="tr", attrs={"class": re.compile(r"dr-table-row")})
        for tr_tag in tr_tags:
            td_tags = tr_tag.find_all(name="td")
            if len(td_tags) < 3:
                # Rows without both a name cell (index 1) and a value cell
                # (index 2) carry no property and are skipped.
                continue
            component_properties.append((td_tags[1].text, td_tags[2].text))

        # NOTE(review): the write is retried without limit or back-off.
        while True:
            # Reset per attempt: if the constructor raises, ``finally`` must
            # not reference an unbound name or re-close an earlier attempt's
            # connection.
            orcl_conn = None
            try:
                orcl_conn = OracleSave(1000005)
                orcl_conn.component_insert(component)
                for component_property in component_properties:
                    orcl_conn.properties_insert(component_property)
                orcl_conn.commit()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            finally:
                if orcl_conn is not None:
                    try:
                        orcl_conn.conn.close()
                    except Exception as e:
                        # A failing close is reported but does not abort the
                        # method or undo a successful commit.
                        print(sys._getframe().f_code.co_name, e)
Ejemplo n.º 5
0
        def extra_thread(page_num):
            """Scrape result page ``page_num`` of the category and store its rows.

            The page URL is built from the enclosing scope's ``url``. It is
            requested through ``self.my_session`` with the proxy in
            ``self.proxy_ip``. A non-200 response switches proxy and retries.
            After more than 10 exceptions the page is given up and treated as
            having no rows. Each usable row is written through a fresh
            ``OracleSave(1000003)`` connection, retrying until the write
            succeeds.

            Args:
                page_num: Page number appended to ``url + "/prl/results/"``.

            Returns:
                None.
            """
            page_url = url + "/prl/results/" + str(page_num)
            count = 0
            while True:
                try:
                    self.my_session.headers.update(my_headers)
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(page_url, timeout=20)
                    if res.status_code != 200:
                        # NOTE(review): bad statuses are not counted, so this
                        # path can retry without limit.
                        print(res.status_code)
                        self.proxy_pool.remove(self.proxy_ip)
                        self.proxy_ip = self.proxy_pool.get()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    component_tags = bs_content.find(
                        name="table", id="sProdList").tbody.find_all(name="tr")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 10:
                        # Give up on this page. The loop always exits here, so
                        # the former ``count > 100`` pool refresh could never
                        # run and has been removed.
                        print(category_tree, page_url)
                        component_tags = []
                        break

            for component_tag in component_tags:
                detail_table = component_tag.find(name="table",
                                                  attrs={"class": "TFtable"})
                td_tags = component_tag.find_all(name="td")
                try:
                    component_code = td_tags[1].text.strip()
                except Exception as e:
                    print("component code is None", e)
                    continue
                try:
                    component_img = td_tags[1].find(name="img",
                                                    attrs={
                                                        "class":
                                                        "productThumbnail"
                                                    }).get("src")
                except Exception:
                    # The thumbnail is optional.
                    component_img = ""
                try:
                    rough_attach = td_tags[2].find(name="a", text="数据表")
                    if not rough_attach:
                        rough_attach = td_tags[2].find(
                            name="a", attrs={"class": "prodDetailsAttachment"})
                    component_attach = rough_attach.get("href")
                    # Only links containing "http" are kept.
                    if "http" not in component_attach:
                        component_attach = ""
                except Exception:
                    component_attach = ""
                try:
                    manufacture_description = td_tags[3].a.find_all(name="p")
                    component_brand = manufacture_description[0].text.strip()
                    # The description itself is not stored. The lookup is kept
                    # because a row without a second <p> is skipped, as before.
                    manufacture_description[1].text.strip()
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    continue
                if not component_img and not component_attach and not component_brand:
                    continue

                component = (component_code, component_brand,
                             first_category_name, second_category_name,
                             page_url, component_attach, component_img)

                # Properties are parsed once, before the retry loop: a <tr>
                # with fewer than two <td> cells is a deterministic parse
                # error, and inside the loop it made every attempt fail, so
                # the loop never ended.
                key_values = []
                if detail_table:
                    for property_tag in detail_table.find_all(name="tr"):
                        detail_td_tags = property_tag.find_all("td")
                        if len(detail_td_tags) < 2:
                            continue
                        key_values.append((detail_td_tags[0].text.strip(),
                                           detail_td_tags[1].text.strip()))

                # NOTE(review): the write is retried without limit or back-off.
                while True:
                    # Reset per attempt so ``finally`` only closes a
                    # connection opened by this attempt.
                    orcl_conn = None
                    try:
                        orcl_conn = OracleSave(1000003)
                        orcl_conn.component_insert(component)
                        for key_value in key_values:
                            orcl_conn.properties_insert(key_value)
                        orcl_conn.commit()
                        break
                    except Exception as e:
                        print(e)
                    finally:
                        # Release the connection on failure too; previously
                        # every failed attempt leaked one.
                        if orcl_conn is not None:
                            try:
                                orcl_conn.conn.close()
                            except Exception as e:
                                # Do not re-insert after a successful commit
                                # just because closing failed.
                                print(e)