def get_series_urls(self, list_url):
    def get_pages_urls(url):
        html_analyse = HtmlAnalyse(url)
        bs_contents = html_analyse.get_bs_contents()
        page_urls = []
        page = len(bs_contents.find_all(name="li", attrs={"class": "pager-item"})) + 1
        for i in range(page):
            page_url = url.split('#')[0] + "?page=" + str(i) + Third_Suffix_Url
            page_urls.append(page_url)
        return page_urls

    series_urls = []
    page_urls = get_pages_urls(list_url)
    if page_urls is None:
        return None
    for page_url in page_urls:
        html_analyse = HtmlAnalyse(page_url)
        bs_contents = html_analyse.get_bs_contents()
        tags = bs_contents.find_all(name='tr', attrs={"class": re.compile(u"ecatalog-series-table")})
        for tag in tags:
            try:
                href = tag.find_all(name="td")[0].a.get("href")
            except Exception:
                href = tag.find_all(name="td")[1].a.get("href")
            m_url = Panasonic_Pre_Url + href + '&limit=100'
            series_urls.append(m_url)
    return series_urls
def file_download(self, url, file_type, file_name=None):
    # Default arguments are evaluated once at definition time, so the random
    # fallback name is generated per call here rather than in the signature.
    if file_name is None:
        file_name = str(random.random())
    download_dir_path = "..\\download_files\\"
    if not os.path.exists(download_dir_path):
        os.mkdir(download_dir_path)
    download_file_path = download_dir_path + file_name + file_type
    if os.path.exists(download_file_path):
        return
    try_count = 0
    while True:
        try:
            download_file_path = download_dir_path + str(random.random()) + file_type
            # html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
            html_analyse = HtmlAnalyse(url)
            html_analyse.download(download_file_path)
            print("File Download Success !!")
            break
        except Exception as e:
            print(sys._getframe().f_code.co_name, url, e)
            try_count += 1
            if try_count > 2 and "https" in url:
                return
            if try_count > 5:
                return
            self.proxy_pool.remove(self.proxy_ip)
            self.proxy_ip = self.proxy_pool.get()
            # download_file_path = download_dir_path + str(random.random()) + file_type
    return download_file_path
def csv_download(self):
    html_analyse = HtmlAnalyse(self.url)
    filename = self.path + 'schottkyBarrierDiodes.csv'
    if os.path.exists(filename):
        return filename
    html_analyse.download(filename)
    return filename
def get_supplier(self, url):
    html_analyse = HtmlAnalyse(url)
    bs_content = html_analyse.get_bs_contents()
    ul_tag = bs_content.find(name="ul", attrs={"class": "txl_content_con_L"})
    # The patterns below match the Chinese field labels on the supplier page:
    # 联系人 = contact person, 传真 = fax, 公司联系电话 = company phone,
    # 手机号码 = mobile number, 联系地址 = address.
    supplier_name = ul_tag.h1.text.strip()
    supplier_place = ul_tag.li.text.split(":", 2)[1].replace("\n", " ").strip()
    supplier_contact = ul_tag.find(name="li", text=re.compile(r'联系人:')).text.split(":", 2)[1].strip()
    supplier_fax = ul_tag.find(name="li", text=re.compile(r'传真:')).text.split(":", 2)[1].strip()
    supplier_phone = ul_tag.find(name="li", text=re.compile(r'公司 联系电话:')).text.split(":", 2)[1].strip()
    supplier_mobile = ul_tag.find(name="li", text=re.compile(r'手机号码:')).text.split(":", 2)[1].strip()
    supplier_address = ul_tag.find(name="li", text=re.compile(r'联系地址:')).text.split(":", 2)[1].strip()
    line = (supplier_name, supplier_place, supplier_contact, supplier_fax,
            supplier_phone, supplier_mobile, supplier_address)
    print(line)
    return line
def get_categories(self):
    main_url = "http://www.chip1stop.com/web/CHN/zh/dispClassSearchTop.do"
    self.proxy_ip = self.proxy_pool.get()
    while True:
        try:
            html_analyse = HtmlAnalyse(main_url, proxy=self.proxy_ip)
            bs_content = html_analyse.get_bs_contents()
            break
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            self.proxy_pool.remove(self.proxy_ip)
            self.proxy_ip = self.proxy_pool.get()
    dl_tags = bs_content.find_all(name="dl", attrs={"class": "categoryListDl clearfix"})
    second_categories = []
    for dl_tag in dl_tags:
        first_directory_name = dl_tag.dt.text
        second_directory_tags = dl_tag.find_all(name="dd")
        for second_directory_tag in second_directory_tags:
            rough_second_directory_name = second_directory_tag.text
            second_directory_name = re.match(r"(.*?)\[", rough_second_directory_name).group(1).strip()
            second_directory_url = "http://www.chip1stop.com/web/CHN/zh" + second_directory_tag.span.a.get("href")[1:]
            second_directory = (first_directory_name, second_directory_name, second_directory_url)
            second_categories.append(second_directory)
    return second_categories
def thread_go(page_url):
    html_analyse = HtmlAnalyse(page_url)
    while True:
        try:
            bs_content = html_analyse.get_bs_contents()
            break
        except Exception as e:
            print(e)
    company_tags = bs_content.find_all(
        name="a", attrs={"target": "_blank", "href": re.compile(r"/\d+")})
    corporations = []
    for company_tag in company_tags:
        corporation = company_tag.text.strip()
        corporation_dict = {
            "corporation": corporation,
            "province_url": city_url,
            "page_url": page_url,
            "状态": "未完成",  # status: pending
            "from": "99114"
        }
        corporations.append(corporation)
        col = self.db.All_Company_Name
        col.insert(corporation_dict)
    print(corporations)
    return corporations
def get_second_category(self):
    while True:
        try:
            html_analyse = HtmlAnalyse(
                "http://china.rs-online.com/web/c/pcb-prototyping/pcb-cleaning/",
                proxy=self.proxy_ip)
            bs_content = html_analyse.get_bs_contents()
            break
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            self.proxy_ip = self.proxy_pool.get()
    first_categories = bs_content.find_all(name="div", attrs={"class": "horizontalMenu sectionUp"})
    second_categories = []
    for first_category in first_categories:
        first_category_name = first_category.span.text
        ul_tags = first_category.find_all(name="ul", attrs={"class": "column1"})
        for ul_tag in ul_tags:
            li_tags = ul_tag.find_all(name="li")
            for li_tag in li_tags:
                second_category_url = Rs_Pre_Url + li_tag.a.get("href")
                second_category_name = li_tag.a.text.replace(li_tag.a.span.text, "").strip()
                second_category = (first_category_name, second_category_name, second_category_url)
                second_categories.append(second_category)
    return second_categories
def get_product_list(self):
    html_analyse = HtmlAnalyse(self.url)
    data = {
        "Category_Radio": "Rchip",
        "Feature_Radio": "Rchip_RFAtten",
        "CATEGORY": "Rchip",
        "FEATURE": "Rchip_RFAtten",
        "INDUCTANCE": "",
        "TOLERANCE": "",
        "IMPEDANCE": "",
        "SIZE": "",
        "POWER": "",
        "RESISTANCE": "",
        "TCR": "",
        "CAPACITANCE": "",
        "TC": "",
        "VOLTAGE": "",
        "FREQUENCY": "",
        "INSERTIONLOSS": "",
        "LIFETIME": "",
        "ANTENNA": "",
        "ISSEARCH": "OK"
    }
    contents = html_analyse.post_contents(data=data).encode().decode()
    bs_contents = BeautifulSoup(contents, "html.parser")
    product_tags = bs_contents.find_all(name="tr")[1:]
    return product_tags
def thread_go(page_url):
    html_analyse = HtmlAnalyse(page_url)
    while True:
        try:
            bs_content = html_analyse.get_bs_contents()
            break
        except Exception as e:
            print(e)
    ul_tag = bs_content.find(name="div", attrs={"class": "leftbox comlist"})
    li_tags = ul_tag.find_all(name="li")
    corporations = []
    for li_tag in li_tags:
        corporation = li_tag.text.strip()
        corporation_dict = {
            "corporation": corporation,
            "province_url": province_url,
            "page_url": page_url,
            "状态": "未完成"  # status: pending
        }
        corporations.append(corporation)
        col = self.db.All_Company_Name
        col.insert(corporation_dict)
    print(corporations)
    return corporations
def __init__(self, url_pdf):
    self.code = url_pdf[0]
    self.url = url_pdf[1]
    self.pdf = url_pdf[2]
    html_analyse = HtmlAnalyse(self.url)
    self.content = html_analyse.get_contents()
def get_products_list(self, series_url):
    def get_pages_urls(url):
        html_analyse = HtmlAnalyse(url, is_proxy=True)
        bs_content = html_analyse.get_bs_contents()
        page_tag = bs_content.find(name="a", attrs={"title": "到最后一页"}, text="末页 »")
        try:
            rough_page = page_tag.get("href")
            page = re.match(r"/ea/products/.*?page=(\d+)&reset=1", rough_page).group(1)
        except:
            page = 0
        page_urls = []
        for i in range(int(page) + 1):
            page_url = url + "&page=" + str(i)
            page_urls.append(page_url)
        return page_urls

    product_lists = []
    page_urls = get_pages_urls(series_url)
    if page_urls is None:
        return None
    for page_url in page_urls[:]:
        html_analyse = HtmlAnalyse(page_url)
        bs_contents = html_analyse.get_bs_contents()
        product_list = bs_contents.find_all(
            name='tr', attrs={"class": re.compile(u"(^odd$)|(^even$)")})[1:]
        if not product_list:
            continue
        product_lists += product_list
    return product_lists
def get_product_list(self):
    html_analyse = HtmlAnalyse(
        "http://www.ti.com.cn/wsapi/paramdata/family/660/results?lang=cn&output=json")
    contents = html_analyse.get_contents()
    datas_jsons = json.loads(contents)
    return datas_jsons["ParametricResults"]
def get_pages_urls(url):
    html_analyse = HtmlAnalyse(url)
    bs_contents = html_analyse.get_bs_contents()
    page_urls = []
    page = len(bs_contents.find_all(name="li", attrs={"class": "pager-item"})) + 1
    for i in range(page):
        page_url = url.split('#')[0] + "?page=" + str(i) + Third_Suffix_Url
        page_urls.append(page_url)
    return page_urls
def get_attach(detail_url):
    html_analyse = HtmlAnalyse(detail_url)
    bs_content = html_analyse.get_bs_contents()
    rough_attach = bs_content.find(name="a", id="displayedPath")
    try:
        attach = ST_Pre_Url + rough_attach.get("href")
    except:
        attach = ""
    return attach
def get_product_list(self):
    get_url = "http://zh-cn.kionix.com/html/json_req.php?url=http%3A//solr-lb-1878662441.ap-northeast-1.elb.amazonaws.com/solr/solr-slave/select/%3Fstart%3D0%26rows%3D100%26wt%3Djson%26json.nl%3Dmap%26facet%3Don%26facet.mincount%3D1%26facet.field%3DPartNumber_copy%26facet.field%3DPartName_copy%26facet.field%3DProductSupplyStatusText_copy%26facet.field%3DPackageShortCode_copy%26facet.field%3DAxis_num%26facet.field%3DGRange_copy%26facet.field%3DSensitivity_copy%26facet.field%3DNoise_num%26facet.field%3DResolution_copy%26facet.field%3DPackageSize_copy%26facet.field%3DPackagePins_copy%26facet.field%3DPackageType_copy%26facet.field%3DInterfaceOutput_copy%26facet.field%3DWakeUp_copy%26facet.field%3DFifoFiloBuffer_copy%26facet.field%3DOperatingTemperatureMin_num%26facet.field%3DOperatingTemperatureMax_num%26facet.field%3DSupplyVoltage_copy%26facet.field%3DCurrentConsumption_copy%26facet.field%3DPartHighlightText_copy%26sort%3DPS_PartNumber%20asc%26q%3D%28PS_ProductDivisionCode%3A701010%20OR%20PS_ProductGroupCode%3A701010%20OR%20PS_ProductFamilyCode%3A701010%20OR%20PS_ProductTypeCode%3A701010%20OR%20PS_ProductSubTypeCode%3A%20701010%29%20AND%20ProductDisplayFlag_num%3A%5B1%20TO%20*%5D%20AND%20PS_PartStatus%3A60&jsonp_callback=jQuery17103304592033228262_1479346785834&_=1479346786175"
    html_analyse = HtmlAnalyse(get_url)
    # Strip the JSONP callback wrapper before parsing the JSON payload.
    rough_data_json = re.match(
        r'jQuery17103304592033228262_1479346785834\((.*?)\);$',
        html_analyse.get_contents()).group(1)
    data_json = json.loads(rough_data_json)
    productlist_json = data_json["response"]["docs"]
    return self.url, productlist_json
def get_product_list(self):
    html_analyse = HtmlAnalyse(self.url)
    bs_contents = html_analyse.get_bs_contents()
    rough_products = bs_contents.find_all(name="tr", attrs={"class": "products"})
    products_urls = []
    for rough_product in rough_products:
        url = Infineon_Pre_Url + rough_product.td.a.get("href")
        products_urls.append(url)
    return products_urls
def get_supplier_urls(self, url):
    html_analyse = HtmlAnalyse(url)
    bs_content = html_analyse.get_bs_contents()
    ul_tags = bs_content.find_all(name="ul", attrs={"class": "sheng_weizhi_lb"})
    urls = []
    for ul_tag in ul_tags:
        # Use a distinct name so the url parameter is not shadowed inside the loop.
        supplier_url = "http://book.youboy.com" + ul_tag.div.strong.a.get("href")
        urls.append(supplier_url)
    return urls
def get_img(url):
    html_analyse = HtmlAnalyse(url)
    bs_content = html_analyse.get_bs_contents()
    rough_img = bs_content.find(name="img", id="productImageId")
    try:
        img = rough_img.get("src")
    except:
        print("No image found for", url)
        img = ''
    return img
def get_code_urls(self, series_url):
    def get_pages_urls(url):
        html_analyse = HtmlAnalyse(url, is_proxy=True)
        bs_content = html_analyse.get_bs_contents()
        page_tag = bs_content.find(name="a", attrs={"title": "到最后一页"}, text="末页 »")
        try:
            rough_page = page_tag.get("href")
            page = re.match(r"/ea/products/.*?page=(\d+)&reset=1", rough_page).group(1)
        except:
            page = 0
        page_urls = []
        for i in range(int(page) + 1):
            page_url = url + "&page=" + str(i)
            page_urls.append(page_url)
        return page_urls

    product_urls = []
    page_urls = get_pages_urls(series_url)
    if page_urls is None:
        return None
    for page_url in page_urls[:]:
        html_analyse = HtmlAnalyse(page_url)
        bs_contents = html_analyse.get_bs_contents()
        rows = bs_contents.find_all(name='tr', attrs={"class": re.compile(u"(^odd$)|(^even$)")})
        if not rows:
            continue
        for row in rows[1:]:
            try:
                model = row.td.a
                code = model.text
            except:
                break
            # ******* de-duplication check *******
            orcl_con = OracleConnection()
            cursor = orcl_con.conn.cursor()
            cursor.execute(
                "select cc_id from product$component_crawl where cc_code='{}'".format(code))
            data = cursor.fetchone()
            cursor.close()
            orcl_con.conn.close()
            if data:
                print("repeat")
                continue
            # ******* end *******
            href = model.get("href")
            url = Panasonic_Pre_Url + href
            product_urls.append(url)
    return product_urls
def get_product_url(self):
    html_analyse = HtmlAnalyse(self.url)
    bs_content = html_analyse.get_bs_contents()
    rough_products = bs_content.find_all(name="div", attrs={"class": "section-devices"})
    img = Atmel_Pre_Url + bs_content.find(
        name="img", attrs={"src": re.compile(r'/Images/.*?\.jpg')}).get("src")
    imgs_urls = []
    for rough_product in rough_products:
        product_url = Atmel_Pre_Url + rough_product.a.get("href")
        img_url = (img, product_url)
        imgs_urls.append(img_url)
    return imgs_urls
def get_pdf(url): html_analyse = HtmlAnalyse(url) bs_content = html_analyse.get_bs_contents() pdf = bs_content.find( name="a", attrs={"href": re.compile(r'/ac/c_download/.*?\.pdf')}) if pdf: pdf_url = pdf.get("href") + "&via=ok" else: pdf_url = '' return pdf_url
def belling(url):
    html_analyse = HtmlAnalyse(url)
    bs_content = html_analyse.get_bs_contents()
    pdf_tags = bs_content.find_all(name="a", attrs={"href": re.compile(r".*?\.pdf$")})
    hrefs = []
    for pdf_tag in pdf_tags:
        href = pdf_tag.get("href")
        print(href)
        hrefs.append(href)
    return hrefs
def get_pdf(url): html_analyse = HtmlAnalyse(url) bs_content = html_analyse.get_bs_contents() pdf = bs_content.find( name="a", attrs={ "href": re.compile( r'/ac/c_download/control/relay/photomos/catalog/semi_cn_' ) }) pdf_url = pdf.get("href") + "&via=ok" return pdf_url
def download(self, pdf_url):
    filename = self.path + str(random.random()) + '.pdf'
    try:
        html_analyse = HtmlAnalyse(pdf_url, proxy=self.proxy_ip)
        html_analyse.download(filename)
        print("Download complete...")
    except Exception as e:
        print(e)
        self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()
        # Return the retry's filename so callers get the path actually written.
        return self.download(pdf_url)
    return filename
def download(self, pdf_url):
    # NOTE: the original snippet parsed an undefined variable `a`; pdf_url is assumed
    # here to be the onclick string of the form "downloadLinkClick('...', ...);return false".
    content_list = re.match(r'downloadLinkClick\((.*?)\);return false', pdf_url).group(1).split(",")
    filename = content_list[0].replace("'", "")
    url = "http://ds.yuden.co.jp/TYCOMPAS/cs/detail.do?mode=download&fileName=" + filename
    isSeriesData = content_list[1]
    isProductsData = content_list[2]
    isProductsDataGraph = content_list[3]
    DownloadForm = {
        "action": "detail.do",
        "classificationID": "AE",
        "fileName": filename,
        "isSeriesData": isSeriesData,
        "isProductsData": isProductsData,
        "isProductsDataGraph": isProductsDataGraph
    }
    html_analyse = HtmlAnalyse(url)
    html_analyse.post_download(
        data=DownloadForm,
        path=r"I:\PythonPrj\StandardSpider\DataAnalyse\NewRules\a.pdf")
    filename = self.path + str(random.random()) + '.pdf'
    try:
        html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
        html_analyse.download(filename)
        print("Download complete...")
    except Exception as e:
        print(e)
        self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()
        return self.download(pdf_url)
    return filename
def get_pages_urls(url):
    html_analyse = HtmlAnalyse(url, is_proxy=True)
    bs_content = html_analyse.get_bs_contents()
    page_tag = bs_content.find(name="a", attrs={"title": "到最后一页"}, text="末页 »")
    try:
        rough_page = page_tag.get("href")
        page = re.match(r"/ea/products/.*?page=(\d+)&reset=1", rough_page).group(1)
    except:
        page = 0
    page_urls = []
    for i in range(int(page) + 1):
        page_url = url + "&page=" + str(i)
        page_urls.append(page_url)
    return page_urls
def get_all_content(self):
    many_contents = []
    for series_url in self.series_urls:
        if series_url == "http://device.panasonic.cn/ac/c/control/sensor/human/wl/number/index.jsp?c=search":
            session = requests.session()
            session.headers.update({
                'Connection': 'keep-alive',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Cache-Control': 'max-age=0',
                'Content-Type': 'application/x-www-form-urlencoded',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
                'Host': 'device.panasonic.cn',
                'Origin': 'http://device.panasonic.cn',
                'Referer': 'http://device.panasonic.cn/ac/c/control/sensor/human/wl/number/index.jsp?c=search',
                'Upgrade-Insecure-Requests': '1'
            })
            form = {
                'pagecnt': 1,
                'maxrows': 20,
                'topage': 2,
                'VAL_3_3286': '',
                'VAL_3_3433': '',
                'VAL_3_3287': '',
                'VAL_3_3436': '',
                'part_no': ''
            }
            content0 = session.get(series_url).text
            bs_contents0 = BeautifulSoup(content0, "html.parser")
            many_contents.append(bs_contents0)
            content1 = session.post(
                "http://device.panasonic.cn/ac/c/control/sensor/human/wl/number/index.jsp?c=move",
                data=form).text
            bs_contents1 = BeautifulSoup(content1, "html.parser")
            many_contents.append(bs_contents1)
        else:
            html_analyse = HtmlAnalyse(series_url)
            bs_contents = html_analyse.get_bs_contents()
            many_contents.append(bs_contents)
    return many_contents
def thread_go(self, parameters):
    url = parameters
    html_analyse = HtmlAnalyse(url)
    bs_content = html_analyse.get_bs_contents()
    tr_tags = bs_content.find(name="tbody").find_all(name="tr")
    cc_code = cc_brandname = cc_attach = ""  # defaults in case a row is missing
    for tr_tag in tr_tags:
        td_tags = tr_tag.find_all("td")
        property_name = td_tags[0].text.strip()
        value_tag = td_tags[1]
        if property_name == "料号":      # part number
            cc_code = value_tag.text.strip()
        elif property_name == "品牌":    # brand
            cc_brandname = value_tag.text.strip()
        elif property_name == "规格书":  # datasheet
            cc_attach = value_tag.get("href")
    # cc_unit, cc_kiname, cc_url and cc_img are expected to be defined elsewhere in the
    # original class; this snippet only fills the fields parsed above.
    component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url, cc_attach, cc_img)
def get_suppliers(self):
    def thread_go(page_url):
        html_analyse = HtmlAnalyse(page_url)
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(e)
        ul_tag = bs_content.find(name="div", attrs={"class": "leftbox comlist"})
        li_tags = ul_tag.find_all(name="li")
        corporations = []
        for li_tag in li_tags:
            corporation = li_tag.text.strip()
            corporation_dict = {
                "corporation": corporation,
                "province_url": province_url,
                "page_url": page_url,
                "状态": "未完成"  # status: pending
            }
            corporations.append(corporation)
            col = self.db.All_Company_Name
            col.insert(corporation_dict)
        print(corporations)
        return corporations

    for province_id in range(1, 36):
        province_url = "http://www.soudh.com/province-" + str(province_id) + ".html"
        html_analyse = HtmlAnalyse(province_url)
        bs_content = html_analyse.get_bs_contents()
        # 当前为 = "currently on page"; 共N页 = "N pages in total".
        page_tag = bs_content.find(name="span", text=re.compile(r'当前为'))
        page_count = int(re.match(r'.*?共(\d+)页', page_tag.text).group(1))
        page_urls = map(
            lambda page_num: province_url[:-5] + "-" + str(page_num) + ".html",
            range(1, page_count + 1))
        # for page_url in page_urls:
        #     thread_go(page_url)
        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread_go, page_urls)
def get_first_classes(self, url="http://www.mlcc1.com/search_simplex.html?searchkey="):
    html_analyse = HtmlAnalyse(url)
    bs_content = html_analyse.get_bs_contents()
    first_tag_names = bs_content.find_all(name="p", attrs={"class": "down"})
    first_class_contents = bs_content.find_all(
        name="ul", attrs={"class": re.compile(r'mlcc_\d+_list')})
    first_classs = []
    for first_tag_name, first_class_content in zip(first_tag_names, first_class_contents):
        # The original pattern was truncated (unbalanced parenthesis); the class name is
        # taken as the text before the first space, which matches the commented fallback.
        first_class_name = re.match(r'(.*?) ', first_tag_name.text).group(1)
        # first_class_name = first_tag_name.text.replace(' ', '')
        first_class = (first_class_name, first_class_content)
        first_classs.append(first_class)
    return first_classs