def thread_go(self, page_url):
    """Store every component that the detail parser finds on ``page_url``.

    ``self.detail.get_page_components(page_url)`` supplies
    ``(component, properties)`` pairs. Each pair is written through its own
    ``OracleSave`` connection, and the write is repeated until it succeeds.

    Args:
        page_url: URL handed to ``self.detail.get_page_components``.
    """
    from contextlib import closing

    for component, single_properties in self.detail.get_page_components(page_url):
        while True:
            try:
                orcl_conn = OracleSave(1111111)
                # closing() releases the connection even when an insert or
                # the commit fails, so a retry never leaves the previous
                # connection open.
                with closing(orcl_conn.conn):
                    orcl_conn.component_insert(component)
                    for properties in single_properties:
                        orcl_conn.properties_insert(properties)
                    orcl_conn.commit()
                break
            except Exception as e:
                print(e, "存储错误")
def page_thread_go(self, page_category):
    """Scrape one search-result page and store every component on it.

    The page is requested through ``self.my_session`` with the current
    proxy. On a non-200 status, or on any error while fetching or locating
    the result table, the proxy is replaced and the request is repeated.
    Every result row is then converted into a component tuple plus a list of
    ``(name, value)`` property pairs and written through ``OracleSave``.

    Args:
        page_category: ``(first_category_name, second_category_name,
            page_url)`` triple.
    """
    from contextlib import closing

    first_category_name, second_category_name, page_url = page_category
    my_headers = {
        'Connection': 'Keep-Alive',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36',
        "Host": "www.mouser.cn",
        "Upgrade-Insecure-Requests": "1",
    }

    # Fetch the page, swapping proxies until the result table can be read.
    count = 0
    while True:
        try:
            self.my_session.headers.update(my_headers)
            self.my_session.proxies.update(self.proxy_ip)
            res = self.my_session.get(page_url, timeout=20)
            if res.status_code != 200:
                print(res.status_code)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            bs_content = BeautifulSoup(res.content, "lxml")
            component_tags = bs_content.find(
                name="table", attrs={"class": "SearchResultsTable"}
            ).find_all(name="tr", attrs={"class": re.compile(r"SearchResult")})
            break
        except Exception as e:
            count += 1
            print(sys._getframe().f_code.co_name, e)
            self.proxy_ip = self.proxy_pool.get()
            if count > 20:
                self.proxy_pool._refresh()

    # A table without matching rows has no header row to index below.
    if not component_tags:
        print("no result rows", page_url)
        return

    # Header cells from index 11 on are used as property names for the
    # trailing cells of each data row.
    table_header_tags = component_tags[0].find_all(name="th")[11:]
    len_heads = len(table_header_tags)

    # Row 0 supplies the headers; data rows are read from index 2 onward.
    for component_tag in component_tags[2:]:
        td_tags = component_tag.find_all(name="td")

        try:
            # Only the first line of the cell text is the component code.
            component_code = td_tags[3].text.strip().partition("\n")[0]
        except Exception as e:
            print("component code is None", e)
            continue

        try:
            component_img = self.mouser_host_url + td_tags[1].find(
                name="img").get("src").replace("/sm/", "/images/")
        except Exception:
            component_img = ""

        try:
            rough_attach = td_tags[6].find(name="a", text=re.compile(r".*数据表"))
            component_attach = rough_attach.get("href")
            if "http" not in component_attach:
                component_attach = ""
        except Exception:
            print("pdf is none", page_url, component_code)
            component_attach = ""

        try:
            component_brand = td_tags[4].a.text
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            continue

        component = (
            component_code, component_brand, first_category_name,
            second_category_name, page_url, component_attach, component_img)

        try:
            rohs_tag = td_tags[10]
        except Exception as e:
            print(e)
            continue

        property_key_values = []
        if rohs_tag.text == "详细信息":
            property_key_values.append(("RoHS", "Yes"))
        if len_heads:
            property_key_values.extend(
                (name_tag.text.strip(), property_tag.text.strip())
                for name_tag, property_tag in zip(
                    table_header_tags, td_tags[-len_heads:]))

        # Retry the save until it succeeds.
        while True:
            try:
                orcl_conn = OracleSave(1000002)
                # closing() releases the connection on failure as well, so a
                # retry does not leave the previous connection open.
                with closing(orcl_conn.conn):
                    orcl_conn.component_insert(component)
                    for key_value in property_key_values:
                        orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                break
            except Exception as e:
                print("database save exception", e)
def thread_go(page_no):
    """Scrape every part of result page ``page_no`` and store its components.

    This is a closure: ``self``, ``my_session``, ``request_cookie``,
    ``complete_form_data``, ``second_category_url``, ``first_category_name``,
    ``second_category_name`` and ``property_names`` come from the enclosing
    scope. For each offset in ``range(0, 25, 5)`` the shared form data is
    updated and POSTed until a response with table rows arrives; each row is
    then written through one ``OracleSave`` connection per part.

    Args:
        page_no: Page number, sent as ``dispPageNo`` in both the URL and the
            form data.
    """
    from contextlib import closing

    print("Page:", page_no)
    detail_url = second_category_url + "&dispPageNo=%d" % page_no

    for page_part in range(0, 25, 5):
        print("Part:", page_part)
        complete_form_data['nextSearchIndex'] = page_part
        complete_form_data['dispPageNo'] = page_no
        complete_form_data['type'] = "page"

        # Repeat the request until it returns at least one data row.
        while True:
            try:
                my_session.cookies.update(request_cookie)
                res = my_session.post(detail_url,
                                      data=complete_form_data,
                                      proxies=self.proxy_ip,
                                      timeout=20)
                print(res.status_code)
                if res.status_code == 200:
                    bs_content = BeautifulSoup(res.content.decode(), "lxml")
                    # The first <tr> is skipped; the rest are data rows.
                    tr_tags = bs_content.find_all(name="tr")[1:]
                    if tr_tags:
                        break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            # Reached for every unusable outcome (error, non-200 status or
            # no rows): drop the current proxy and try the next one.
            self.proxy_pool.remove(self.proxy_ip)
            self.proxy_ip = self.proxy_pool.get()

        # Database connection, closed even if a row below raises.
        orcl_conn = OracleSave(1000001)
        with closing(orcl_conn.conn):
            for tr_tag in tr_tags:
                try:
                    code = tr_tag.td.find(
                        name="p", attrs={"class": "text14pt2 bold"}
                    ).text.strip()
                except Exception as e:
                    print(e)
                    continue

                chip1stop_code = tr_tag.td.find(
                    name="p", attrs={"class": "text10"}).text.strip()
                print(chip1stop_code)
                maker = tr_tag.td.find(
                    name="p", attrs={"class": "text10 wordBreak"}).text.strip()

                pdf_url = tr_tag.find(
                    name="a",
                    attrs={
                        "href": re.compile(
                            r"http://download\.siliconexpert\.com/pdfs")
                    })
                if pdf_url:
                    pdf_url = pdf_url.get("href")

                component = (code, maker, first_category_name,
                             second_category_name, second_category_url,
                             pdf_url, None)
                orcl_conn.component_insert(component)

                property_tags = tr_tag.find_all(name="td")[6:-1]
                for property_name, property_tag in zip(property_names,
                                                       property_tags):
                    if property_name == '购买/询价':
                        continue
                    property_value = property_tag.text
                    if property_value:
                        property_value = property_value.strip()
                    orcl_conn.properties_insert(
                        (property_name, property_value))

                # Commit per row so rows already stored survive a failure
                # on a later row.
                orcl_conn.commit()
def thread_go(self, page_attributes):
    """Scrape one product page and store the component with its properties.

    The page is loaded through ``HtmlAnalyse(cc_url).get_bs_contents()``,
    retried until it succeeds. Brand, part number, image URL and datasheet
    URL are read from the markup; pages without a brand or part-number tag
    are ignored. The component and its ``(name, value)`` property rows are
    then written through ``OracleSave``, retried until the write succeeds.

    Args:
        page_attributes: ``(cc_unit, cc_kiname, cc_url)`` triple.
    """
    cc_unit, cc_kiname, cc_url = page_attributes
    html_analyse = HtmlAnalyse(cc_url)
    while True:
        try:
            bs_content = html_analyse.get_bs_contents()
            break
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)

    brand_tag = bs_content.find(name="span", attrs={"itemprop": "brand"})
    name_tag = bs_content.find(name="span", attrs={"itemprop": "mpn"})
    if not brand_tag or not name_tag:
        return
    cc_brandname = brand_tag.text.strip()
    cc_code = name_tag.text.strip()

    img_tag = bs_content.find(name="img", attrs={"itemprop": "image"})
    cc_img = Rs_Pre_Url + img_tag.get("src") if img_tag else ""

    # The datasheet URL is embedded in an onclick="window.open('...pdf')".
    attach_tag = bs_content.find(
        name="a",
        attrs={"onclick": re.compile(r"window\.open\('http://docs")})
    if not attach_tag:
        cc_attach = ""
    else:
        attach_name = attach_tag.get("onclick")
        try:
            cc_attach = re.match(r"window\.open\('(.*?\.pdf)'\)",
                                 attach_name).group(1)
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            cc_attach = ""

    component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url,
                 cc_attach, cc_img)

    # Component properties: parsed once, before the save loop, so that a
    # malformed row cannot make every save attempt fail.
    component_properties = []
    for tr_tag in bs_content.find_all(
            name="tr", attrs={"class": re.compile(r"dr-table-row")}):
        td_tags = tr_tag.find_all(name="td")
        if len(td_tags) < 3:
            # Cells 1 and 2 hold the name and value; a shorter row has no
            # usable pair.
            continue
        component_properties.append((td_tags[1].text, td_tags[2].text))

    while True:
        # Reset per attempt so the finally clause never touches a
        # connection that was not opened in this attempt.
        orcl_conn = None
        try:
            orcl_conn = OracleSave(1000005)
            orcl_conn.component_insert(component)
            for component_property in component_properties:
                orcl_conn.properties_insert(component_property)
            orcl_conn.commit()
            break
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
        finally:
            if orcl_conn is not None:
                orcl_conn.conn.close()
def extra_thread(page_num):
    """Scrape result page ``page_num`` and store every component on it.

    This is a closure: ``self``, ``url``, ``my_headers``, ``category_tree``,
    ``first_category_name`` and ``second_category_name`` come from the
    enclosing scope. The page is requested with rotating proxies; after more
    than ten failed attempts the page is given up and treated as empty. Each
    product row is converted into a component tuple, and the rows of its
    detail table into ``(name, value)`` properties, which are written
    through ``OracleSave`` (retried until the write succeeds).

    Args:
        page_num: Page index appended to ``url + "/prl/results/"``.
    """
    from contextlib import closing

    page_url = url + "/prl/results/" + str(page_num)

    count = 0
    while True:
        try:
            self.my_session.headers.update(my_headers)
            self.my_session.proxies.update(self.proxy_ip)
            res = self.my_session.get(page_url, timeout=20)
            if res.status_code != 200:
                print(res.status_code)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
                continue
            bs_content = BeautifulSoup(res.content, "lxml")
            component_tags = bs_content.find(
                name="table", id="sProdList").tbody.find_all(name="tr")
            break
        except Exception as e:
            count += 1
            print(sys._getframe().f_code.co_name, e)
            self.proxy_ip = self.proxy_pool.get()
            if count > 10:
                # Give up on this page and report where it happened.
                print(category_tree, page_url)
                component_tags = []
                break

    for component_tag in component_tags:
        detail_table = component_tag.find(name="table",
                                          attrs={"class": "TFtable"})
        td_tags = component_tag.find_all(name="td")

        try:
            component_code = td_tags[1].text.strip()
        except Exception as e:
            print("component code is None", e)
            continue

        try:
            component_img = td_tags[1].find(
                name="img", attrs={"class": "productThumbnail"}).get("src")
        except Exception:
            component_img = ""

        try:
            rough_attach = td_tags[2].find(name="a", text="数据表")
            if not rough_attach:
                rough_attach = td_tags[2].find(
                    name="a", attrs={"class": "prodDetailsAttachment"})
            component_attach = rough_attach.get("href")
            if "http" not in component_attach:
                component_attach = ""
        except Exception:
            component_attach = ""

        try:
            manufacture_description = td_tags[3].a.find_all(name="p")
            component_brand = manufacture_description[0].text.strip()
            # The value is unused, but the lookup stays: a row without a
            # second <p> is skipped through the except branch.
            component_description = manufacture_description[1].text.strip()
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            continue

        if not component_img and not component_attach and not component_brand:
            continue

        component = (component_code, component_brand, first_category_name,
                     second_category_name, page_url, component_attach,
                     component_img)

        # Parse the detail rows once, before the save loop, so that a
        # malformed row cannot make every save attempt fail.
        key_values = []
        if detail_table:
            for property_tag in detail_table.find_all(name="tr"):
                detail_td_tags = property_tag.find_all("td")
                if len(detail_td_tags) < 2:
                    # A name/value pair needs two cells.
                    continue
                key_values.append((detail_td_tags[0].text.strip(),
                                   detail_td_tags[1].text.strip()))

        while True:
            try:
                orcl_conn = OracleSave(1000003)
                # closing() releases the connection on failure as well, so a
                # retry does not leave the previous connection open.
                with closing(orcl_conn.conn):
                    orcl_conn.component_insert(component)
                    for key_value in key_values:
                        orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                break
            except Exception as e:
                print(e)