class FileSystem:
    """Download remote files to a local folder and push them to the file server.

    A proxy is taken from a ``ProxyPool`` on construction and replaced
    whenever a download attempt fails.
    """

    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def file_download(self, url, file_type, file_name=None):
        """Download ``url`` into ``..\\download_files\\`` and return the local path.

        Args:
            url: Address of the file to fetch.
            file_type: Suffix appended to the local name (e.g. ``".pdf"``).
            file_name: Optional base name. When given and a file with that
                name and suffix already exists, nothing is downloaded.

        Returns:
            The path of the downloaded file, or ``None`` when the named file
            already exists or every attempt failed.
        """
        download_dir_path = "..\\download_files\\"
        # exist_ok avoids a crash when another worker creates the folder
        # between an existence check and the mkdir call.
        os.makedirs(download_dir_path, exist_ok=True)

        # The default used to be ``str(random.random())``, which Python
        # evaluates only once at definition time; ``None`` is the sentinel now.
        if file_name is not None and os.path.exists(
                download_dir_path + file_name + file_type):
            return None

        try_count = 0
        while True:
            # Every attempt is stored under a fresh random name.
            download_file_path = (download_dir_path + str(random.random())
                                  + file_type)
            try:
                html_analyse = HtmlAnalyse(url)
                html_analyse.download(download_file_path)
            except Exception as e:
                print(sys._getframe().f_code.co_name, url, e)
                try_count += 1
                # https URLs are given up on after 3 failures, others after 6.
                if try_count > 5 or (try_count > 2 and "https" in url):
                    return None
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
            else:
                print("File Download Success !!")
                return download_file_path

    def file_upload(self, local_file_path, max_retries=5):
        """Upload a local file to ``File_Server_Url``.

        Args:
            local_file_path: Path of the file to send; falsy values are ignored.
            max_retries: Maximum number of upload attempts before giving up.

        Returns:
            The ``"path"`` value from the server's JSON reply, or ``None``
            when there is nothing to upload, the file is missing, or every
            attempt failed.
        """
        if not local_file_path:
            return None

        for _ in range(max_retries):
            try:
                with open(local_file_path, "rb") as f:
                    res = requests.post(File_Server_Url, files={'file': f})
                if res.status_code == 200:
                    server_file_path = res.json()["path"]
                    print("File Server Upload Success !!")
                    return server_file_path
            except FileNotFoundError as e:
                # Retrying cannot make a missing file appear.
                print(sys._getframe().f_code.co_name, e)
                return None
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
        return None

    def download_upload(self, url, file_type):
        """Download ``url`` and upload the result; return the server path or ``None``."""
        download_file_path = self.file_download(url, file_type)
        return self.file_upload(download_file_path)
class MouserGo:
    """Crawler for the mouser.cn category tree and component listing pages.

    All requests go through one shared ``requests`` session whose proxy is
    swapped for a new one from the ``ProxyPool`` whenever a request fails.
    """

    # Headers sent with every request; the per-request variants add to these.
    _COMMON_HEADERS = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        "Host": "www.mouser.cn",
        "Upgrade-Insecure-Requests": "1",
    }
    _UA_CHROME_53 = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                     '(KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36')
    _UA_CHROME_49 = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                     '(KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36')
    _CATEGORY_ROOT_URL = "http://www.mouser.cn/Electronic-Components/"
    # Compiled once instead of once per table row.
    _DATASHEET_LINK_TEXT = re.compile(r".*数据表")
    _RESULT_ROW_CLASS = re.compile(r"SearchResult")

    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()
        self.mouser_host_url = "http://www.mouser.cn"
        self.my_session = requests.session()

    # ------------------------------------------------------------------ helpers
    @classmethod
    def _headers(cls, accept_language, user_agent, referer=None):
        """Return a fresh header dict built on top of the common headers."""
        headers = dict(cls._COMMON_HEADERS)
        headers['Accept-Language'] = accept_language
        headers["User-Agent"] = user_agent
        if referer is not None:
            headers["Referer"] = referer
        return headers

    def _rotate_proxy(self, discard=True):
        """Switch to a new proxy, optionally removing the current one from the pool."""
        if discard:
            self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()

    # --------------------------------------------------------------- categories
    def get_all_category(self):
        """Fetch the category index and expand every sub-category link.

        Returns:
            A list of ``(first, second, third, fourth, url, component_count)``
            tuples gathered from ``get_detail_category`` for each link.
        """
        headers = self._headers(
            'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            self._UA_CHROME_53, referer=self._CATEGORY_ROOT_URL)
        while True:
            try:
                self.my_session.proxies.update(self.proxy_ip)
                self.my_session.headers.update(headers)
                # A timeout keeps a dead proxy from hanging the crawl forever.
                res = self.my_session.get(self._CATEGORY_ROOT_URL, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self._rotate_proxy()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                category_url_tags = bs_content.find_all(
                    name="a", attrs={"class": "SearchResultsSubLevelCategory"})
                if not category_url_tags:
                    print(sys._getframe().f_code.co_name,
                          "category_url_tag is None")
                    # Retrying through the same proxy would just busy-loop.
                    self._rotate_proxy()
                    continue
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self._rotate_proxy()

        multi_category_structures = []
        for category_url_tag in category_url_tags:
            # The first two characters of the href are dropped before it is
            # appended to the host URL.
            url = self.mouser_host_url + category_url_tag.get("href")[2:]
            multi_category_structures += self.get_detail_category(url)
        return multi_category_structures

    def get_detail_category(self, url):
        """Read one category page and return its leaf category tuples.

        Args:
            url: Address of a category page.

        Returns:
            A list of ``(first, second, third, fourth, fourth_url,
            component_count)`` tuples. When the page lists no
            ``div-cat-title`` children, a single tuple describing the page
            itself is returned. ``component_count`` is a digit string.
        """
        headers = self._headers('zh-CN,zh;q=0.8', self._UA_CHROME_53,
                                referer=self._CATEGORY_ROOT_URL)
        breadcrumb_id = "ctl00_ContentMain_bc_rptrBreadcrumbs_ctl0%d_lnkBreadcrumb"
        while True:
            try:
                self.my_session.proxies.update(self.proxy_ip)
                self.my_session.headers.update(headers)
                res = self.my_session.get(url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self._rotate_proxy()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                first_category_tag = bs_content.find(name="a",
                                                     id=breadcrumb_id % 1)
                if not first_category_tag:
                    print("None, go on")
                    self._rotate_proxy()
                    continue
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self._rotate_proxy()

        first_category_name = first_category_tag.text
        second_category_name = bs_content.find(name="a",
                                               id=breadcrumb_id % 2).text
        third_category_tag = bs_content.find(name="a", id=breadcrumb_id % 3)
        # Two-level breadcrumbs reuse the second name as the third.
        third_category_name = (third_category_tag.text if third_category_tag
                               else second_category_name)

        detail_category_tags = bs_content.find_all(
            name="div", attrs={"class": "div-cat-title"})
        category_structures = []
        if detail_category_tags:
            pre_category_url = re.match(r"(.+)/_/.+/$", url).group(1)
            for detail_category_tag in detail_category_tags:
                forth_category_tag = detail_category_tag.a
                category_structures.append((
                    first_category_name, second_category_name,
                    third_category_name, forth_category_tag.text,
                    pre_category_url + forth_category_tag.get("href")[5:],
                    detail_category_tag.span.span.text.replace(",", "")))
        else:
            component_count_tag = bs_content.find(
                name="span", id="ctl00_ContentMain_lblProductCount")
            component_count = (component_count_tag.text.replace("(", "")
                               .replace(")", "").replace(",", ""))
            category_structures.append((
                first_category_name, second_category_name,
                third_category_name, third_category_name, url,
                component_count))
        print(category_structures)
        return category_structures

    # ---------------------------------------------------------------------- csv
    def category_to_csv(self, category_structure):
        """Write the category tuples to ``..\\Mouser.csv``.

        The ``csv`` module quotes fields containing commas, so the file reads
        back correctly through ``read_from_csv`` (the hand-joined version
        split such fields into extra columns).
        """
        with open("..\\Mouser.csv", "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(category_structure)

    def read_from_csv(self):
        """Return every row of ``..\\Mouser.csv`` as a list of string lists."""
        csv_categories = []
        with open("..\\Mouser.csv", "r", encoding="utf-8", newline="") as f:
            for line in csv.reader(f):
                print(line)
                csv_categories.append(line)
        return csv_categories

    # ----------------------------------------------------------------- crawling
    def thread_go(self, category_tree):
        """Crawl and store every listing page of one category row.

        Args:
            category_tree: A row ``[first, ..., url, component_count]`` as
                produced by ``read_from_csv`` / ``get_detail_category``.
        """
        first_category_name = category_tree[0].replace("\ufeff", "")
        second_category_name = str(category_tree[1:-2])
        url, component_count = category_tree[-2:]
        # component_count arrives as text, so it must be converted before the
        # comparison; the raw ``== 1`` test could never be true.
        if int(component_count) == 1:
            return
        for page_num in range(0, int(component_count), 25):
            self._crawl_page(first_category_name, second_category_name,
                             url + "?No=" + str(page_num))

    def get_page_url(self, category_trees):
        """Expand category rows into ``(first, second, page_url)`` tuples.

        One tuple is produced per 25-item listing page; categories holding a
        single component are skipped.
        """
        pages_category = []
        for category_tree in category_trees:
            first_category_name = category_tree[0].replace("\ufeff", "")
            second_category_name = str(category_tree[1:-2])
            url, component_count = category_tree[-2:]
            if int(component_count) == 1:
                continue
            for page_num in range(0, int(component_count), 25):
                pages_category.append((first_category_name,
                                       second_category_name,
                                       url + "?No=" + str(page_num)))
        return pages_category

    def page_thread_go(self, page_category):
        """Crawl and store one listing page given ``(first, second, page_url)``."""
        first_category_name, second_category_name, page_url = page_category
        self._crawl_page(first_category_name, second_category_name, page_url)

    def _crawl_page(self, first_category_name, second_category_name, page_url):
        """Fetch one listing page, parse each result row and save it."""
        component_tags = self._fetch_result_rows(page_url)
        if not component_tags:
            return
        # Header cells from index 11 onward name the per-component properties.
        table_header_tags = component_tags[0].find_all(name="th")[11:]
        for component_tag in component_tags[2:]:
            parsed = self._parse_component_row(
                component_tag.find_all(name="td"), table_header_tags,
                first_category_name, second_category_name, page_url)
            if parsed is None:
                continue
            component, property_key_values = parsed
            self._save_component(component, property_key_values)

    def _fetch_result_rows(self, page_url):
        """Return the ``SearchResult`` rows of a listing page, retrying until found."""
        headers = self._headers('zh-CN,zh;q=0.8', self._UA_CHROME_49)
        failures = 0
        while True:
            try:
                self.my_session.headers.update(headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(page_url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self._rotate_proxy()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                return bs_content.find(
                    name="table", attrs={"class": "SearchResultsTable"}
                ).find_all(name="tr", attrs={"class": self._RESULT_ROW_CLASS})
            except Exception as e:
                failures += 1
                print(sys._getframe().f_code.co_name, e)
                # The failing proxy is kept in the pool here, as before.
                self._rotate_proxy(discard=False)
                if failures > 20:
                    self.proxy_pool._refresh()

    def _parse_component_row(self, td_tags, table_header_tags,
                             first_category_name, second_category_name,
                             page_url):
        """Turn one result row into ``(component, properties)`` or ``None`` to skip it."""
        try:
            # Only the first line of the cell holds the part number.
            component_code = td_tags[3].text.strip().split("\n", 1)[0]
        except Exception as e:
            print("component code is None", e)
            return None

        try:
            component_img = self.mouser_host_url + td_tags[1].find(
                name="img").get("src").replace("/sm/", "/images/")
        except Exception:
            component_img = ""

        try:
            rough_attach = td_tags[6].find(name="a",
                                           text=self._DATASHEET_LINK_TEXT)
            component_attach = rough_attach.get("href")
            if "http" not in component_attach:
                component_attach = ""
        except Exception:
            print("pdf is none", page_url, component_code)
            component_attach = ""

        try:
            component_brand = td_tags[4].a.text
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            return None

        try:
            rohs_tag = td_tags[10]
        except Exception as e:
            print(e)
            return None

        component = (component_code, component_brand, first_category_name,
                     second_category_name, page_url, component_attach,
                     component_img)

        property_key_values = []
        if rohs_tag.text == "详细信息":
            property_key_values.append(("RoHS", "Yes"))
        len_heads = len(table_header_tags)
        if len_heads:
            # The trailing cells line up with the trailing header cells.
            for name_tag, property_tag in zip(table_header_tags,
                                              td_tags[-len_heads:]):
                property_key_values.append((name_tag.text.strip(),
                                            property_tag.text.strip()))
        return component, property_key_values

    def _save_component(self, component, property_key_values):
        """Insert a component and its properties, retrying until the commit succeeds."""
        while True:
            orcl_conn = None
            try:
                orcl_conn = OracleSave(1000002)
                orcl_conn.component_insert(component)
                for key_value in property_key_values:
                    orcl_conn.properties_insert(key_value)
                orcl_conn.commit()
                return
            except Exception as e:
                print("database save exception", e)
            finally:
                # Close on every path so failed attempts do not leak connections.
                if orcl_conn is not None:
                    try:
                        orcl_conn.conn.close()
                    except Exception as close_error:
                        print("database close exception", close_error)
class RsGo:
    """Crawler for the china.rs-online.com category menu and product pages."""

    _MENU_URL = "http://china.rs-online.com/web/c/pcb-prototyping/pcb-cleaning/"

    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    # ------------------------------------------------------------------ helpers
    def _fetch_soup(self, url, discard_proxy=False):
        """Fetch ``url`` through the current proxy, retrying until it parses.

        Args:
            url: Page address.
            discard_proxy: When true, a failing proxy is removed from the pool
                before a new one is taken.
        """
        while True:
            try:
                return HtmlAnalyse(url, proxy=self.proxy_ip).get_bs_contents()
            except Exception as e:
                # Frame 1 is the public method that asked for the page.
                print(sys._getframe(1).f_code.co_name, e, url)
                if discard_proxy:
                    self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

    @staticmethod
    def _iter_second_level(bs_content):
        """Yield ``(first_name, second_name, second_url)`` from the menu page."""
        first_categories = bs_content.find_all(
            name="div", attrs={"class": "horizontalMenu sectionUp"})
        for first_category in first_categories:
            first_category_name = first_category.span.text
            for ul_tag in first_category.find_all(name="ul",
                                                  attrs={"class": "column1"}):
                for li_tag in ul_tag.find_all(name="li"):
                    second_category_url = Rs_Pre_Url + li_tag.a.get("href")
                    # The nested span's text is stripped out of the link text.
                    second_category_name = li_tag.a.text.replace(
                        li_tag.a.span.text, "").strip()
                    yield (first_category_name, second_category_name,
                           second_category_url)

    @staticmethod
    def _page_count(total, page_size=20):
        """Return how many listing pages hold ``total`` items (ceiling division)."""
        return -(-total // page_size)

    # --------------------------------------------------------------- categories
    def get_second_category(self):
        """Return ``(first_name, second_name, second_url)`` for every menu entry."""
        bs_content = self._fetch_soup(self._MENU_URL)
        return list(self._iter_second_level(bs_content))

    def get_page_url(self, second_category):
        """Walk a second-level category and crawl every product it lists.

        Each third-level category is paged through (20 products per page) and
        the product URLs of every page are handed to ``thread_go`` through a
        ``ThreadingPool``.

        Args:
            second_category: ``(first_name, second_name, second_url)``.
        """
        first_category_name, second_category_name, second_category_url = second_category
        bs_content = self._fetch_soup(second_category_url)
        ul_tag = bs_content.find(name="ul", attrs={"class": "brcategories"})
        if ul_tag is None:
            print(second_category_url, "brcategories list not found")
            return
        third_category_tags = ul_tag.find_all(
            name="div", attrs={"class": "rsGARealEstate"})
        for third_category_tag in third_category_tags:
            third_category_name = third_category_tag.a.text
            third_category_url = Rs_Pre_Url + third_category_tag.a.get("href")
            bs_content = self._fetch_soup(third_category_url)
            try:
                page_tag = bs_content.find(name="div", attrs={
                    "class": "viewProdDiv"
                }).text
            except Exception as e:
                print(third_category_url, e, "找不到page_tag")
                continue
            flag = re.match(r".*?共(.*?)个", page_tag)
            if flag is None:
                print(third_category_url, "找不到page_tag")
                continue
            # Ceiling division: ``n / 20 + 1`` asked for an extra, empty page
            # whenever the total was an exact multiple of 20.
            page_count = self._page_count(int(flag.group(1).strip()))
            union_category_name = (second_category_name + "---"
                                   + third_category_name)
            for page_num in range(page_count):
                page_url = third_category_url + "?pn=" + str(page_num + 1)
                bs_content = self._fetch_soup(page_url)
                component_url_tags = bs_content.find_all(
                    name="a", attrs={"class": "tnProdDesc"})
                page_attributes = [
                    (first_category_name, union_category_name,
                     Rs_Pre_Url + component_url_tag.get("href"))
                    for component_url_tag in component_url_tags
                ]
                if not page_attributes:
                    continue
                # NOTE(review): the pool is created here because no assignment
                # of ``threadingpool`` is visible in this method — confirm the
                # worker count of 4 is still wanted.
                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(self.thread_go, page_attributes)

    # ----------------------------------------------------------------- products
    def thread_go(self, page_attributes):
        """Scrape one product page and store it with its properties.

        Args:
            page_attributes: ``(unit, kind_name, product_url)``.
        """
        cc_unit, cc_kiname, cc_url = page_attributes
        html_analyse = HtmlAnalyse(cc_url)
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)

        brand_tag = bs_content.find(name="span", attrs={"itemprop": "brand"})
        name_tag = bs_content.find(name="span", attrs={"itemprop": "mpn"})
        if not brand_tag or not name_tag:
            return
        cc_brandname = brand_tag.text.strip()
        cc_code = name_tag.text.strip()

        img_tag = bs_content.find(name="img", attrs={"itemprop": "image"})
        cc_img = Rs_Pre_Url + img_tag.get("src") if img_tag else ""

        attach_tag = bs_content.find(
            name="a",
            attrs={"onclick": re.compile(r"window\.open\('http://docs")})
        cc_attach = ""
        if attach_tag:
            try:
                cc_attach = re.match(r"window\.open\('(.*?\.pdf)'\)",
                                     attach_tag.get("onclick")).group(1)
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)

        component = (cc_code, cc_brandname, cc_unit, cc_kiname, cc_url,
                     cc_attach, cc_img)

        # Properties are parsed before the retry loop: a malformed row is a
        # deterministic failure and used to restart the insert forever.
        component_properties = []
        for tr_tag in bs_content.find_all(
                name="tr", attrs={"class": re.compile(r"dr-table-row")}):
            td_tags = tr_tag.find_all(name="td")
            if len(td_tags) < 3:
                continue
            component_properties.append((td_tags[1].text, td_tags[2].text))

        while True:
            orcl_conn = None
            try:
                orcl_conn = OracleSave(1000005)
                orcl_conn.component_insert(component)
                for component_property in component_properties:
                    orcl_conn.properties_insert(component_property)
                orcl_conn.commit()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            finally:
                # Guarded so a failed OracleSave() does not raise NameError here.
                if orcl_conn is not None:
                    orcl_conn.conn.close()

    # ---------------------------------------------------------------------- csv
    def csv_write(self, category_structures):
        """Write the category tuples to ``..\\Rs-online.csv``.

        The ``csv`` module quotes fields that contain commas, so they no
        longer spill into extra columns.
        """
        with open("..\\Rs-online.csv", "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(category_structures)

    def get_csv_categories(self):
        """Return ``(first, second, third, third_url)`` for every leaf category.

        A second-level page without a ``brcategories`` list is reported as its
        own third level.
        """
        menu_soup = self._fetch_soup(self._MENU_URL, discard_proxy=True)
        third_categories = []
        for (first_category_name, second_category_name,
             second_category_url) in self._iter_second_level(menu_soup):
            bs_content = self._fetch_soup(second_category_url,
                                          discard_proxy=True)
            ul_tag = bs_content.find(name="ul",
                                     attrs={"class": "brcategories"})
            if ul_tag:
                for third_category_tag in ul_tag.find_all(
                        name="div", attrs={"class": "rsGARealEstate"}):
                    third_category = (
                        first_category_name, second_category_name,
                        third_category_tag.a.text,
                        Rs_Pre_Url + third_category_tag.a.get("href"))
                    print(third_category)
                    third_categories.append(third_category)
            else:
                third_category = (first_category_name, second_category_name,
                                  second_category_name, second_category_url)
                print(third_category)
                third_categories.append(third_category)
        return third_categories
class PdfDownload:
    """Download datasheet PDFs of a crawl task and register them on the file server."""

    def __init__(self, task_code):
        self.task_code = task_code
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()
        self.path = "..\\tmp\\"
        os.makedirs(self.path, exist_ok=True)
        self.db = OracleConnection()

    def write(self):
        """Write a small marker file ``text.txt`` into the working folder."""
        with open(self.path + "text.txt", 'w') as f:
            f.write('aaa')

    def get_urls_from_db(self):
        """Return the distinct PDF URLs of this task that still lack an uploaded copy.

        Before selecting, ``'None'`` placeholders are cleared and uploads
        already known for the same source URL are copied onto matching rows.
        The connection held in ``self.db`` is committed and closed afterwards,
        so this method can be called only once per instance.
        """
        cursor = self.db.conn.cursor()
        try:
            cursor.execute(
                "update product$component set cmp_attach=null where cmp_attach='None'"
            )
            # Reuse uploads of PDFs that were already crawled earlier.
            cursor.execute(
                "merge into product$component_crawl a using ( select cc_b2c_attach,cc_attach from product$component_crawl where cc_b2c_attach is not null group by cc_b2c_attach,cc_attach ) b on (a.cc_attach = b.cc_attach ) when matched then update set a.cc_b2c_attach = b.cc_b2c_attach where a.cc_b2c_attach is null"
            )
            # Bind variable instead of str.format: no quoting bugs or injection.
            cursor.execute(
                "select distinct cc_attach from product$component_crawl where cc_b2c_attach is null and cc_attach is not null and cc_task=(select cct_id from product$component_crawl_task where cct_taskid=:task_code)",
                {"task_code": str(self.task_code)})
            pdf_datas = cursor.fetchall()
            cursor.close()
            self.db.conn.commit()
        finally:
            self.db.conn.close()
        return [pdf_data[0] for pdf_data in pdf_datas]

    def download(self, pdf_url, max_retries=10):
        """Download ``pdf_url`` into the working folder.

        Args:
            pdf_url: Address of the PDF.
            max_retries: Number of attempts, each through a fresh proxy.

        Returns:
            The path of the file that was actually written, or ``None`` when
            every attempt failed. (The recursive version returned the name of
            the first, failed attempt.)
        """
        for _ in range(max_retries):
            filename = self.path + str(random.random()) + '.pdf'
            try:
                html_analyse = HtmlAnalyse(pdf_url, proxy=self.proxy_ip)
                html_analyse.download(filename)
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()
            else:
                print("下载完成。。。")
                return filename
        return None

    def upload(self, filename, pdf_url, max_retries=10):
        """Upload ``filename`` and store the server path for rows matching ``pdf_url``.

        Args:
            filename: Local file to send; falsy values (a failed download) are
                ignored.
            pdf_url: Source URL identifying the rows to update.
            max_retries: Number of upload-and-record attempts before giving up.
        """
        if not filename:
            return
        for _ in range(max_retries):
            try:
                with open(filename, 'rb') as file:
                    res = requests.post("http://10.10.100.200:9999/file/upload",
                                        files={'file': file})
                res_j = res.json()
                print("上传完成")
                self._record_upload(res_j['path'], pdf_url)
                return
            except Exception as e:
                print(e)

    def _record_upload(self, server_path, pdf_url):
        """Store ``server_path`` on every crawl row whose source is ``pdf_url``."""
        db = OracleConnection()
        try:
            cursor = db.conn.cursor()
            cursor.execute(
                "update product$component_crawl set cc_b2c_attach=:b2c_attach where cc_attach=:attach",
                {"b2c_attach": server_path, "attach": pdf_url})
            cursor.close()
            db.conn.commit()
        finally:
            db.conn.close()

    def go(self):
        """Download and upload every pending PDF one after another."""
        for pdf_url in self.get_urls_from_db():
            filename = self.download(pdf_url)
            self.upload(filename, pdf_url)

    def thread_go(self):
        """Download and upload every pending PDF through a ``ThreadingPool``."""
        pdf_urls = self.get_urls_from_db()

        def thread(pdfurl):
            filename = self.download(pdfurl)
            self.upload(filename, pdfurl)

        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread, pdf_urls)
class MLCC1Detail:
    """Scrape the product listing pages of one second-level class.

    Args:
        second_class: ``(first_class_name, second_class_name, url, page_count)``.
    """

    def __init__(self, second_class):
        (self.first_class_name, self.second_class_name, self.url,
         self.page_count) = second_class
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def _rotate_proxy(self):
        """Drop the current proxy from the pool and take a new one."""
        self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()

    @staticmethod
    def _absolute_pdf_url(href, base_url):
        """Return ``href`` unchanged when absolute, otherwise prefixed with ``base_url``.

        ``https://`` links count as absolute too; they used to be prefixed and
        thereby broken.
        """
        if href.startswith(("http://", "https://")):
            return href
        return base_url + href

    def get_class_components(self):
        """Return an iterator over the listing page URLs ``url&p=1 .. url&p=page_count``."""
        return (self.url + "&p=" + str(num)
                for num in range(1, self.page_count + 1))

    def _fetch_page(self, page_url):
        """Return the decoded body of ``page_url``, changing proxy until it succeeds."""
        while True:
            my_session = requests.session()
            try:
                try:
                    my_session.headers.update(First_Headers)
                    my_session.cookies.update(First_Cookies)
                    my_session.proxies.update(self.proxy_ip)
                except Exception as e:
                    print(sys._getframe(1).f_code.co_name, e)
                    time.sleep(1)
                    self._rotate_proxy()
                    continue
                try:
                    res = my_session.get(page_url, timeout=15)
                    content = res.content.decode()
                except Exception as e:
                    print(e)
                    self._rotate_proxy()
                    continue
                if res.status_code == 200 and content:
                    return content
                self._rotate_proxy()
            finally:
                # One session is created per attempt; release its sockets.
                my_session.close()

    def get_page_components(self, page_url):
        """Parse one listing page.

        Args:
            page_url: Address of the listing page.

        Returns:
            A list of ``(component, properties)`` pairs where ``component`` is
            ``(code, brand, first_class, second_class, page_url, pdf_url,
            "null")`` and ``properties`` is a list of ``(name, value)`` tuples
            starting with ``("product_parameter", ...)``.
        """
        bs_content = BeautifulSoup(self._fetch_page(page_url), "lxml")
        product_tags = bs_content.find_all(
            name="li", attrs={"data-id": re.compile(r'\d+')})

        many_components_properties = []
        for product_tag in product_tags:
            all_p_tags = product_tag.find_all(name="p")
            try:
                head_tag = all_p_tags[0]
                product_code = head_tag.b.a.text
                product_brand = head_tag.find(name='a', id='brand_n').text
                product_parameter = head_tag.find(name="a", id="params").text
            except (IndexError, AttributeError):
                # Rows without the expected anchors are not products.
                continue

            pdf_tag = product_tag.find(name="a", attrs={"data-id": "pdf"})
            href = pdf_tag.get("href") if pdf_tag is not None else None
            product_pdf = self._absolute_pdf_url(href, Pre_Url) if href else ""

            component = (product_code, product_brand, self.first_class_name,
                         self.second_class_name, page_url, product_pdf, "null")
            properties = [("product_parameter", product_parameter)]

            # The fourth paragraph, when present, holds "name:value" spans.
            product_details = (all_p_tags[3].find_all(name="span")
                               if len(all_p_tags) > 3 else [])
            for product_detail in product_details:
                detail_text = product_detail.text.split(":")
                detail_value = detail_text[1] if len(detail_text) > 1 else ""
                properties.append((detail_text[0], detail_value))

            many_components_properties.append((component, properties))
        return many_components_properties
class SearchList:
    """Search tianyancha.com for a keyword and queue the matching companies in MongoDB."""

    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.page_count = ""

    @staticmethod
    def _extract_token(data_v):
        """Return the ``token`` value hidden in the ``v`` field of the tongji reply.

        ``data_v`` is text that evaluates to a sequence of byte values; it is
        parsed with ``ast.literal_eval`` so the server cannot make this
        process execute arbitrary expressions.
        """
        import ast  # local import: only this helper needs it
        payload = str(bytes(ast.literal_eval(data_v)))
        return re.match(r".*?token=(.*?);.*?", payload).group(1)

    def _fetch_search_content(self, key_word):
        """Return the decoded search JSON for ``key_word``, changing proxy until it works."""
        while True:
            self.proxy_ip = self.proxy_pool.get()
            my_session = requests.session()
            # Copy the shared templates: writing Referer/token into the
            # module-level dicts leaked state between calls and threads.
            tianyan_headers = dict(TianYan_Headers)
            tianyan_headers["Referer"] = (
                "http://www.tianyancha.com/search?key=" + key_word +
                "&checkFrom=searchBox").encode().decode('latin-1')
            my_session.headers.update(tianyan_headers)
            try:
                my_session.proxies.update(self.proxy_ip)
            except Exception as e:
                print(e)
                time.sleep(1)
                continue

            try:
                first_res = my_session.get(
                    "http://www.tianyancha.com/tongji/" + key_word +
                    ".json?random=" +
                    str(round(time.time(), 3)).replace(".", ""),
                    timeout=15)
                first_content = first_res.content
                # json.loads replaces eval() on data received from the network.
                first_data_v = json.loads(first_content.decode())["data"]["v"]
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                continue
            if first_res.status_code != 200 or not first_content:
                self.proxy_pool.remove(self.proxy_ip)
                continue

            my_cookie = dict(TianYan_Cookies)
            my_cookie["token"] = self._extract_token(first_data_v)
            my_session.cookies.update(my_cookie)

            try:
                real_res = my_session.get(
                    "http://www.tianyancha.com/search/" + key_word + ".json?",
                    timeout=15)
                content = real_res.content.decode()
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                continue
            # Check the search response itself; the first response's status
            # was being tested here a second time.
            if real_res.status_code != 200 or not content:
                self.proxy_pool.remove(self.proxy_ip)
                continue
            return content

    def get_all_urls(self, key_word):
        """Search for ``key_word`` and record every hit in MongoDB.

        Each hit is upserted into ``spider.All_Company_Info`` with status
        "未完成", and the keyword's rows in ``spider.All_Company_Name`` are
        marked "已完成" (or "无数据" when the search is empty). Returns
        ``None``; the method also returns early when the search reply is not
        valid JSON.
        """
        content = self._fetch_search_content(key_word)
        try:
            brief_companies = json.loads(content)["data"]
        except Exception as e:
            print(e)
            return

        while True:
            conn = None
            try:
                conn = MongoClient("10.10.101.22", 27017)
                name_col = conn.spider.All_Company_Name
                if not brief_companies:
                    print(key_word, "无数据")
                    name_col.update({"corporation": key_word},
                                    {'$set': {"状态": "无数据"}}, multi=True)
                    return
                detail_col = conn.spider.All_Company_Info
                for brief_company in brief_companies:
                    company_id = brief_company["id"]
                    detail_company = {
                        "company_id": company_id,
                        "url": "http://www.tianyancha.com/company/"
                               + str(company_id),
                        "状态": "未完成"
                    }
                    detail_col.update({"company_id": company_id},
                                      {'$set': detail_company}, upsert=True)
                name_col.update({"corporation": key_word},
                                {'$set': {"状态": "已完成"}}, multi=True)
                print(key_word, "已完成")
                return
            except Exception as e:
                print(e)
            finally:
                # Close on every path so failed attempts do not pile up clients.
                if conn is not None:
                    conn.close()
class DetailInfo:
    """Fetch the detail JSON of one tianyancha.com company and store it in MongoDB."""

    # Lookup table used to turn the index list of the tongji reply into the
    # ``_utm`` cookie value.
    _UTM_ALPHABET = "6,b,t,f,2,z,l,5,w,h,q,i,s,e,c,p,m,u,9,8,y,k,j,r,x,n,-,0,3,4,d,1,a,o,7,v,g".split(",")

    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def _rotate_proxy(self):
        """Drop the current proxy from the pool and take a new one."""
        self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()

    @staticmethod
    def _decode_payload(data_v):
        """Return ``str(bytes(...))`` of the byte values encoded in ``data_v``.

        ``ast.literal_eval`` replaces ``eval`` so text received from the
        server cannot execute code.
        """
        import ast  # local import: only this helper needs it
        return str(bytes(ast.literal_eval(data_v)))

    @classmethod
    def _decode_utm(cls, another):
        """Map a comma-separated index list onto the ``_utm`` alphabet."""
        return "".join(cls._UTM_ALPHABET[int(i)] for i in another.split(","))

    def get_detail(self, url):
        """Download the company JSON behind ``url`` and save it.

        The row of ``spider.All_Company_Info`` whose ``url`` matches is
        updated with the data and status "已完成", or with status "无数据"
        when the reply carries no data. Requests are retried, changing proxy
        on each failure, until a parseable reply arrives.

        Args:
            url: Company page address containing ``company``; the token
                endpoint is derived by replacing ``company`` with ``tongji``.
        """
        while True:
            my_session = requests.session()
            # Work on copies: mutating the shared templates leaked the
            # CheckError header and cookies into later calls.
            my_headers = dict(TianYan_Detail_Headers)
            my_headers["Referer"] = url
            my_session.headers.update(my_headers)
            try:
                my_session.proxies.update(self.proxy_ip)
            except Exception as e:
                print(e)
                time.sleep(1)
                self._rotate_proxy()
                continue

            try:
                first_res = my_session.get(
                    url.replace("company", "tongji") + ".json?random="
                    + str(round(time.time(), 3)).replace(".", ""),
                    timeout=15)
                first_content = first_res.content
                first_data_v = json.loads(first_content.decode())["data"]["v"]
            except Exception as e:
                print(e)
                self._rotate_proxy()
                continue
            if first_res.status_code != 200 or not first_content:
                print(first_res.status_code)
                self._rotate_proxy()
                continue

            payload = self._decode_payload(first_data_v)
            first_token = re.match(r".*?token=(.*?);.*?", payload).group(1)
            another = re.match(r".*?\{return'(.*?)'", payload).group(1)

            my_cookie = dict(TianYan_Detail_Cookies)
            my_cookie["token"] = first_token
            my_cookie["_utm"] = self._decode_utm(another)
            my_headers["CheckError"] = "check"
            my_session.cookies.update(my_cookie)
            my_session.headers.update(my_headers)

            try:
                real_res = my_session.get(url + ".json", timeout=15)
                content = real_res.content.decode()
            except Exception as e:
                print(e)
                self._rotate_proxy()
                continue
            if real_res.status_code != 200 or not content:
                print(real_res.status_code)
                self._rotate_proxy()
                continue

            try:
                brief_companies = json.loads(content)["data"]
                break
            except Exception as e:
                # An unparseable body triggers another full round trip.
                print(e)

        conn = MongoClient("10.10.101.22", 27017)
        try:
            col = conn.spider.All_Company_Info
            if not brief_companies:
                print(url, "无数据")
                col.update({"url": url}, {'$set': {"状态": "无数据"}})
            else:
                while True:
                    try:
                        col.update({"url": url},
                                   {'$set': {"data": brief_companies,
                                             "状态": "已完成"}})
                        print(url, "已完成")
                        break
                    except Exception as e:
                        print(e)
        finally:
            conn.close()
        return
class Category:
    """Crawl chip1stop.com category listings and store components in Oracle."""

    _SITE_ROOT = "http://www.chip1stop.com/web/CHN/zh"
    _PAGE_SIZE = 25    # products per result page ("dispNum" below)
    _MAX_PAGES = 400   # hard cap on pages crawled per category

    # Datasheet links are recognised by this host prefix.
    _PDF_LINK = re.compile(r"http://download\.siliconexpert\.com/pdfs")

    # Form posted to open a category (first listing request).
    _FIRST_PAGE_FORM = {
        "nextSearchIndex": "0",
        "dispPageNo": "1",
        "dispNum": "100",
        "type": "page",
    }

    _REQUEST_HEADERS = {
        "Accept": "text/html, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded",
        "Origin": "http://www.chip1stop.com",
        "Host": "www.chip1stop.com",
        "Proxy-Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.14 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }

    # Captured cookie values sent with every page request.
    _REQUEST_COOKIE = {
        "CK_005": "jS1czThT76C51HOUQ42UtQ06TsvRnzI105VoKAixt4s=",
        "CK_002": "aYWM1+FztffTlWgoCLG3iw==",
        "CK_001": "v1gP31jjkR0=",
        "CK_007": "cPDwiM71wuQ=",
        "CK_006": "kQp2UYR7V1g=",
        "CK_008": "i0dI70Swgcs=",
        "WMONID": "VvpmCoTZsss",
        "_gat": "1",
        "_ga": "GA1.2.1422864444.1488415703",
        "JSESSIONIDVERSION": "2f633173:8",
        "JSESSIONID": "b7d640d0a05a7885ab3cab0168cf.ap2",
        "JREPLICA": "c1-instance6",
        "id": "27e37541744912b7||t=1486458155|et=730|cs=002213fd4869c45d604be72033",
        "Referer": "https://www.chip1stop.com/web/CHN/zh/login.do"
    }

    # Base form for the per-page requests; page-specific keys are overlaid
    # on a copy for every request.
    _PAGE_FORM = {
        "nextSearchIndex": "0",
        "dispPageNo": "1",
        "dispNum": "25",
        "rental": "false",
        "partSameFlg": "false",
        "subWinSearchFlg": "false",
        "used": "false",
        "newProductFlg": "false",
        "newProudctHandlingFlg": "false",
        "newSameDayShippedFlg": "false",
        "eventId": "0001",
        "searchType": "2",
        "dispAllFlg": "true",
    }

    def __init__(self):
        self.proxy_pool = ProxyPool()

    def _rotate_proxy(self):
        """Drop the current proxy from the pool and pick another one."""
        self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()

    @staticmethod
    def _count_pages(products_count, page_size=25, max_pages=400):
        """Return the number of result pages for ``products_count`` items.

        Uses ceiling division. The previous ``int(n / 25) + 1`` asked for
        one page too many whenever ``n`` was an exact multiple of 25, and
        that empty page could never satisfy the retry loop.
        """
        total = int(products_count)
        return min(max_pages, -(-total // page_size))

    def get_categories(self):
        """Return ``(first_name, second_name, second_url)`` for every
        second-level category on the class-search page.

        Retries through fresh proxies until the page can be fetched.
        """
        main_url = "http://www.chip1stop.com/web/CHN/zh/dispClassSearchTop.do"
        self.proxy_ip = self.proxy_pool.get()
        while True:
            try:
                html_analyse = HtmlAnalyse(main_url, proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self._rotate_proxy()

        second_categories = []
        for dl_tag in bs_content.find_all(
                name="dl", attrs={"class": "categoryListDl clearfix"}):
            first_directory_name = dl_tag.dt.text
            for second_directory_tag in dl_tag.find_all(name="dd"):
                # The link text looks like "name[count]"; keep the name part.
                second_directory_name = re.match(
                    r"(.*?)\[", second_directory_tag.text).group(1).strip()
                second_directory_url = (
                    self._SITE_ROOT
                    + second_directory_tag.span.a.get("href")[1:])
                second_categories.append((first_directory_name,
                                          second_directory_name,
                                          second_directory_url))
        return second_categories

    def _open_category(self, category_url):
        """POST the first listing request until it answers 200.

        Returns ``(session, response)``. The caller owns the session and
        must close it.
        """
        while True:
            my_session = requests.session()
            try:
                my_session.headers.update(self._REQUEST_HEADERS)
                self.proxy_ip = self.proxy_pool.get()
                res = my_session.post(category_url,
                                      data=self._FIRST_PAGE_FORM,
                                      proxies=self.proxy_ip, timeout=10)
                print(res.status_code)
                if res.status_code == 200:
                    return my_session, res
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            # Failed attempt: release the session and discard the proxy. The
            # next iteration draws a fresh proxy from the pool.
            my_session.close()
            self.proxy_pool.remove(self.proxy_ip)

    def _fetch_rows(self, my_session, detail_url, form):
        """Return the product ``<tr>`` tags of one page part.

        Retries until a 200 response containing at least one data row
        arrives. Every failed attempt (exception, non-200 status, or an
        empty table) switches to another proxy. The collapsed original was
        ambiguous here, and either reading left one failure mode retrying
        forever on the same proxy.
        """
        while True:
            try:
                my_session.cookies.update(self._REQUEST_COOKIE)
                res = my_session.post(detail_url, data=form,
                                      proxies=self.proxy_ip, timeout=20)
                print(res.status_code)
                if res.status_code == 200:
                    bs_content = BeautifulSoup(res.content.decode(), "lxml")
                    # The first <tr> is the header row.
                    tr_tags = bs_content.find_all(name="tr")[1:]
                    if tr_tags:
                        return tr_tags
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
            self._rotate_proxy()

    def _save_rows(self, tr_tags, category, property_names):
        """Insert one component (and its properties) per table row."""
        first_category_name, second_category_name, second_category_url = category
        # Database connection
        orcl_conn = OracleSave(1000001)
        try:
            for tr_tag in tr_tags:
                try:
                    code = tr_tag.td.find(name="p", attrs={
                        "class": "text14pt2 bold"
                    }).text.strip()
                except Exception as e:
                    # Rows without a part number are not product rows.
                    print(e)
                    continue
                chip1stop_code = tr_tag.td.find(name="p", attrs={
                    "class": "text10"
                }).text.strip()
                print(chip1stop_code)
                maker = tr_tag.td.find(name="p", attrs={
                    "class": "text10 wordBreak"
                }).text.strip()
                pdf_url = tr_tag.find(name="a", attrs={"href": self._PDF_LINK})
                if pdf_url:
                    pdf_url = pdf_url.get("href")
                component = (code, maker, first_category_name,
                             second_category_name, second_category_url,
                             pdf_url, None)
                orcl_conn.component_insert(component)

                property_tags = tr_tag.find_all(name="td")[6:-1]
                for property_name, property_tag in zip(property_names,
                                                       property_tags):
                    # Skip the "buy / request a quote" column.
                    if property_name == '购买/询价':
                        continue
                    property_value = property_tag.text
                    if property_value:
                        property_value = property_value.strip()
                    orcl_conn.properties_insert((property_name,
                                                 property_value))
                orcl_conn.commit()
        finally:
            # Close the connection even when a row raises; it used to leak.
            orcl_conn.conn.close()

    def _crawl_page(self, my_session, category, property_names, page_no):
        """Fetch and store result page ``page_no`` in five parts."""
        print("Page:", page_no)
        detail_url = category[2] + "&dispPageNo=%d" % page_no
        for page_part in range(0, 25, 5):
            print("Part:", page_part)
            form = dict(self._PAGE_FORM, nextSearchIndex=page_part,
                        dispPageNo=page_no, type="page")
            tr_tags = self._fetch_rows(my_session, detail_url, form)
            self._save_rows(tr_tags, category, property_names)

    def get_product_list(self):
        """Crawl every second-level category page by page into Oracle.

        At most ``_MAX_PAGES`` pages of 25 products are read per category.
        A category whose page lacks the product counter or the header row
        is reported and skipped instead of aborting the crawl.
        """
        for category in self.get_categories():
            second_category_url = category[2]
            my_session, res = self._open_category(second_category_url)
            try:
                bs_content = BeautifulSoup(res.content.decode(), "lxml")
                count_tag = bs_content.find(name="span",
                                            attrs={"class": "bold_red"})
                table_headers_tag = bs_content.find(name="tr",
                                                    attrs={"class": "parent"})
                if count_tag is None or table_headers_tag is None:
                    print(sys._getframe().f_code.co_name, second_category_url,
                          "unexpected page layout")
                    continue
                products_count = count_tag.text.replace(",", "").replace(
                    "件", "")

                # Component parameter names (columns between the fixed
                # leading cells and the trailing cell).
                property_names = [
                    tag.text.strip()
                    for tag in table_headers_tag.find_all(name="td")[6:-1]
                ]

                # Total pages
                pages_count = self._count_pages(products_count,
                                                self._PAGE_SIZE,
                                                self._MAX_PAGES)
                for page_no in range(1, pages_count + 1):
                    self._crawl_page(my_session, category, property_names,
                                     page_no)
            finally:
                my_session.close()
class FPNewark:
    """Crawl newark.com category trees and product pages into Oracle.

    A *category tree* is a sequence whose last two items are the category
    URL and its component count (as a string); everything before them is
    the chain of category names.
    """

    # "Name (1,234)" -> name and raw count.
    _CATEGORY_LABEL = re.compile(r"(.*?) \((\d+.*?)\)")
    _PAGE_SIZE = 25  # products per result page

    def __init__(self):
        self.my_session = requests.session()
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    # ------------------------------------------------------------ helpers

    def _rotate_proxy(self):
        """Drop the current proxy from the pool and pick another one."""
        self.proxy_pool.remove(self.proxy_ip)
        self.proxy_ip = self.proxy_pool.get()

    @staticmethod
    def _request_headers():
        """Return Newark request headers built on a *copy* of Default_Header.

        The original aliased and mutated the shared module-level dict, which
        leaked the Newark ``host`` header into every other user of it.
        """
        headers = dict(Default_Header)
        headers["host"] = "www.newark.com"
        headers["Referer"] = "http://www.newark.com/"
        headers["Upgrade-Insecure-Requests"] = "1"
        return headers

    @classmethod
    def _split_label(cls, label):
        """Split ``"Name (1,234)"`` into ``("Name", "1234")``.

        Returns ``None`` when the text does not have that shape.
        """
        found = cls._CATEGORY_LABEL.match(label)
        if found is None:
            return None
        return found.group(1), found.group(2).replace(",", "")

    @staticmethod
    def _page_count(component_count, page_size=25):
        """Number of result pages needed for ``component_count`` items (ceil)."""
        return -(-int(component_count) // page_size)

    # --------------------------------------------------------- categories

    def get_category_trees(self, category_trees):
        """Expand each tree recursively down to its leaf categories.

        A category page without a ``ul.categoryList`` is a leaf and is
        returned as-is; otherwise its children are expanded in turn.
        Children with a component count of exactly ``'1'`` are skipped.
        """
        multi_category_trees = []
        for category_tree in category_trees:
            url = category_tree[-2]
            count = 0
            while True:
                try:
                    self.my_session.proxies.update(self.proxy_ip)
                    res = self.my_session.get(url, timeout=20)
                    if res.status_code != 200:
                        print(res.status_code)
                        # Consistent with thread_go: do not hammer the same
                        # proxy after a bad status.
                        self._rotate_proxy()
                        continue
                    bs_content = BeautifulSoup(res.content, "lxml")
                    break
                except Exception as e:
                    count += 1
                    print(sys._getframe().f_code.co_name, url, e)
                    self.proxy_ip = self.proxy_pool.get()
                    if count > 100:
                        self.proxy_pool._refresh()

            category_list = bs_content.find(name="ul",
                                            attrs={"class": "categoryList"})
            if not category_list:
                print(category_tree)
                multi_category_trees.append(category_tree)
                continue

            child_trees = []
            for child_category_tag in category_list.find_all(name="a"):
                child_category_url = child_category_tag.get("href")
                label = self._split_label(child_category_tag.text.strip())
                if label is None:
                    # Was an uncaught AttributeError that aborted the crawl.
                    print(sys._getframe().f_code.co_name,
                          "unrecognised category label",
                          child_category_tag.text.strip())
                    continue
                child_category_name, component_count = label
                if component_count == '1':
                    continue
                child_trees.append(
                    list(category_tree)[:-2]
                    + [child_category_name, child_category_url,
                       component_count])
            child_categories = self.get_category_trees(child_trees)
            print(child_categories)
            multi_category_trees += child_categories
        print("Current Count: ", len(multi_category_trees))
        return multi_category_trees

    def get_first_categories(self):
        """Return ``(first_name, second_name, url, count)`` tuples scraped
        from the browse-for-products page."""
        my_headers = self._request_headers()
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(
                    "http://www.newark.com/browse-for-products", timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self._rotate_proxy()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                # A dict is used here like everywhere else in this class;
                # the original passed a set literal as attrs.
                first_category_tags = bs_content.find_all(
                    name="ul", attrs={"class": "categoryList"})
                break
            except Exception as e:
                print("Part1", sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()

        second_pages = []
        for first_category_tag in first_category_tags:
            first_category_name = first_category_tag.li.h2.text.strip()
            for second_category_tag in first_category_tag.li.ul.find_all(
                    name="li"):
                second_category_url = second_category_tag.a.get("href")
                label = self._split_label(second_category_tag.text.strip())
                if label is None:
                    print(sys._getframe().f_code.co_name,
                          "unrecognised category label",
                          second_category_tag.text.strip())
                    continue
                second_category_name, component_count = label
                if component_count == '1':
                    continue
                second_pages.append((first_category_name,
                                     second_category_name,
                                     second_category_url, component_count))
        return second_pages

    # ---------------------------------------------------------------- CSV

    def csv_write(self, category_structures):
        """Write the category trees to ``..\\Newark_test.csv``, one per row.

        Rows are written with ``csv.writer`` so that names containing commas
        or quotes are quoted and ``read_from_csv`` gets the same fields
        back. The original joined fields with "," by hand.
        """
        with open("..\\Newark_test.csv", "w", encoding="utf-8") as f:
            # "\n" keeps the line ending the hand-written version produced.
            writer = csv.writer(f, lineterminator="\n")
            writer.writerows(category_structures)

    def read_from_csv(self):
        """Return every row of ``..\\Newark_test.csv`` as a list of strings."""
        csv_categories = []
        # newline="" is what the csv module asks for on input files.
        with open("..\\Newark_test.csv", "r", encoding="utf-8",
                  newline="") as f:
            for line in csv.reader(f):
                print(line)
                csv_categories.append(line)
        return csv_categories

    # ------------------------------------------------------ product pages

    def _fetch_component_rows(self, page_url, my_headers, category_tree):
        """Return the ``<tr>`` tags of the product table on ``page_url``.

        A non-200 answer switches proxy and retries. After more than ten
        exceptions the page is reported and an empty list is returned.
        """
        failures = 0
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(page_url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self._rotate_proxy()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                return bs_content.find(
                    name="table", id="sProdList").tbody.find_all(name="tr")
            except Exception as e:
                failures += 1
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
                if failures > 10:
                    print(category_tree, page_url)
                    return []

    @staticmethod
    def _parse_component_row(component_tag, first_category_name,
                             second_category_name, page_url):
        """Turn one product row into ``(component_tuple, detail_table)``.

        Returns ``None`` for rows that should be skipped: no part number,
        no brand/description cell, or nothing useful at all.
        """
        detail_table = component_tag.find(name="table",
                                          attrs={"class": "TFtable"})
        td_tags = component_tag.find_all(name="td")
        try:
            component_code = td_tags[1].text.strip()
        except Exception as e:
            print("component code is None", e)
            return None

        try:
            component_img = td_tags[1].find(name="img", attrs={
                "class": "productThumbnail"
            }).get("src")
        except Exception:
            component_img = ""

        try:
            rough_attach = td_tags[2].find(name="a", text="数据表")
            if not rough_attach:
                rough_attach = td_tags[2].find(
                    name="a", attrs={"class": "prodDetailsAttachment"})
            component_attach = rough_attach.get("href")
            if "http" not in component_attach:
                component_attach = ""
        except Exception:
            component_attach = ""

        try:
            manufacture_description = td_tags[3].a.find_all(name="p")
            component_brand = manufacture_description[0].text.strip()
            # The value is unused, but the lookup is kept on purpose: rows
            # without a second <p> (description) are skipped, as before.
            manufacture_description[1].text.strip()
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            return None

        if not component_img and not component_attach and not component_brand:
            return None
        component = (component_code, component_brand, first_category_name,
                     second_category_name, page_url, component_attach,
                     component_img)
        return component, detail_table

    @staticmethod
    def _save_component(component, detail_table):
        """Insert a component and its properties, retrying until it commits.

        Each attempt uses its own connection, which is now always closed.
        Previously every failed attempt leaked one.
        """
        while True:
            orcl_conn = None
            try:
                orcl_conn = OracleSave(1000003)
                orcl_conn.component_insert(component)
                if detail_table:
                    for property_tag in detail_table.find_all(name="tr"):
                        detail_td_tags = property_tag.find_all("td")
                        property_name = detail_td_tags[0].text.strip()
                        property_value = detail_td_tags[1].text.strip()
                        orcl_conn.properties_insert((property_name,
                                                     property_value))
                orcl_conn.commit()
                break
            except Exception as e:
                print(e)
            finally:
                if orcl_conn is not None:
                    try:
                        orcl_conn.conn.close()
                    except Exception as e:
                        # A failed close must not trigger a second insert
                        # of an already committed component.
                        print(e)

    def _crawl_page(self, category_tree, page_num, my_headers):
        """Fetch result page ``page_num`` of the tree and store its rows."""
        first_category_name = category_tree[0]
        second_category_name = str(category_tree[1:-2])
        page_url = category_tree[-2] + "/prl/results/" + str(page_num)
        for component_tag in self._fetch_component_rows(
                page_url, my_headers, category_tree):
            parsed = self._parse_component_row(
                component_tag, first_category_name, second_category_name,
                page_url)
            if parsed is not None:
                self._save_component(*parsed)

    def thread_go(self, category_tree, start_page=877):
        """Crawl the result pages of one category tree sequentially.

        Args:
            category_tree: names followed by ``url, component_count``.
            start_page: first page to fetch. The default keeps the resume
                point that used to be hard-coded.
        """
        my_headers = self._request_headers()
        page_count = self._page_count(category_tree[-1], self._PAGE_SIZE)
        for page_num in range(start_page, page_count + 1):
            self._crawl_page(category_tree, page_num, my_headers)

    def extra_go(self, category_tree, page_range=None):
        """Crawl a fixed range of result pages using a pool of 8 workers.

        Args:
            category_tree: names followed by ``url, component_count``.
            page_range: pages to fetch. Defaults to the previously
                hard-coded ``range(875, 17557)``.
        """
        my_headers = self._request_headers()
        if page_range is None:
            page_range = range(875, 17557)

        def extra_thread(page_num):
            self._crawl_page(category_tree, page_num, my_headers)

        extra_threading = ThreadingPool(8)
        extra_threading.multi_process(extra_thread, page_range)
class DownloadUpload:
    """Download a file through a proxy and push it to the file server."""

    # Browser-like headers sent with every download request.
    _DOWNLOAD_HEADERS = {
        'Connection': 'Keep-Alive',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        "host": "www.newark.com",
        "Referer": "http://www.newark.com/",
        "Upgrade-Insecure-Requests": "1",
    }

    def __init__(self):
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()

    def file_download(self, url, file_type, file_name=None, max_tries=None):
        """Download ``url`` into ``..\\download_files\\`` and return the path.

        Args:
            url: address of the file.
            file_type: suffix appended to the file name (e.g. ``".pdf"``).
            file_name: base name to store the file under. When omitted a
                random name is generated per call. The old default,
                ``str(random.random())``, was evaluated once at definition
                time, and an explicit name was ignored when saving.
            max_tries: give up after this many failed attempts. ``None``
                (default) retries forever, as before.

        Returns:
            The local path, or ``None`` when a file with the requested name
            already exists or ``max_tries`` was exhausted.
        """
        download_dir_path = "..\\download_files\\"
        os.makedirs(download_dir_path, exist_ok=True)

        if file_name is None:
            download_file_path = (download_dir_path + str(random.random())
                                  + file_type)
        else:
            download_file_path = download_dir_path + file_name + file_type
            if os.path.exists(download_file_path):
                return None

        try_count = 0
        while max_tries is None or try_count < max_tries:
            try:
                # The context manager releases the session's connections.
                with requests.session() as my_session:
                    my_session.headers.update(self._DOWNLOAD_HEADERS)
                    my_session.proxies.update(self.proxy_ip)
                    # A timeout keeps a dead proxy from hanging the crawl.
                    res = my_session.get(url, timeout=20)
                if res.status_code == 200 and res.content:
                    with open(download_file_path, 'wb') as f:
                        f.write(res.content)
                    print("File Download Success !!")
                    return download_file_path
                print(sys._getframe().f_code.co_name, url, res.status_code)
            except Exception as e:
                print(sys._getframe().f_code.co_name, url, e)
            # Any failure, including a bad status, moves on to another proxy
            # instead of spinning on the same one.
            try_count += 1
            self.proxy_pool.remove(self.proxy_ip)
            self.proxy_ip = self.proxy_pool.get()
        return None

    def file_upload(self, local_file_path):
        """Upload a local file to ``File_Server_Url`` and return its server path.

        Returns ``None`` when no path is given or the file does not exist.
        A missing file used to raise inside the retry loop and spin forever.
        Upload errors and non-200 answers are retried indefinitely.
        """
        if not local_file_path:
            return None
        if not os.path.isfile(local_file_path):
            print(sys._getframe().f_code.co_name, local_file_path,
                  "does not exist")
            return None

        while True:
            try:
                with open(local_file_path, "rb") as f:
                    res = requests.post(File_Server_Url, files={'file': f})
                if res.status_code == 200:
                    server_file_path = res.json()["path"]
                    break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
        print("File Server Upload Success !!")
        return server_file_path

    def download_upload(self, url, file_type):
        """Download ``url`` and upload the result; return the server path."""
        download_file_path = self.file_download(url, file_type)
        return self.file_upload(download_file_path)
"Content-Type": "application/x-www-form-urlencoded", "Origin": "http://www.chip1stop.com", "Referer": "http://www.chip1stop.com/web/CHN/zh/search.do?classCd=040101&classLv=3&subWinSearchFlg=false&searchType=2&dispAllFlg=true&searchFlg=false", "Host": "www.chip1stop.com", "Proxy-Connection": "keep-alive", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.14 Safari/537.36", "X-Requested-With": "XMLHttpRequest", } if __name__ == "__main__": proxy_pool = ProxyPool() proxy_ip = proxy_pool.get() my_session = requests.session() my_session.headers.update(request_headers) while True: try: res = my_session.post( "http://www.chip1stop.com/web/CHN/zh/search.do?", data=form_data, proxies=proxy_ip) print(res.status_code) break except Exception as e: print(e) proxy_pool.remove(proxy_ip) proxy_ip = proxy_pool.get() content = res.content.decode() print(content)
class PdfDownload1:
    """Download the PDFs referenced by one crawl task and upload them.

    Attachment references are read from ``product$component_crawl``. Each
    one is downloaded to ``D:\\pdf\\``, uploaded to the file server, and
    the returned server path is written back to the table.
    """

    def __init__(self, task_code):
        self.task_code = task_code
        self.proxy_pool = ProxyPool()
        self.proxy_ip = self.proxy_pool.get()
        # Same value as before; the old literal "D:\pdf\\" relied on the
        # invalid escape "\p".
        self.path = "D:\\pdf\\"
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self.db = OracleConnection()

    def write(self):
        """Write a small probe file into the download directory."""
        with open(self.path + "text.txt", 'w') as f:
            f.write('aaa')

    def get_urls_from_db(self):
        """Return the attachment references of this task not yet uploaded.

        Side effects: normalises ``'None'`` attachment strings to NULL,
        copies already known server paths onto rows with the same source
        attachment, commits, and closes ``self.db.conn``. The connection is
        therefore unusable afterwards.
        """
        cursor = self.db.conn.cursor()
        cursor.execute(
            "update product$component set cmp_attach=null where cmp_attach='None'"
        )
        # Reuse uploads from earlier crawls of the same PDF.
        cursor.execute(
            "merge into product$component_crawl a using ( select cc_b2c_attach,cc_attach from product$component_crawl where cc_b2c_attach is not null group by cc_b2c_attach,cc_attach ) b on (a.cc_attach = b.cc_attach ) when matched then update set a.cc_b2c_attach = b.cc_b2c_attach where a.cc_b2c_attach is null"
        )
        # Bind variable instead of str.format. The value is bound as a
        # string because the old statement compared against a quoted literal.
        # NOTE(review): assumes self.db.conn is a cx_Oracle-style connection
        # that accepts named binds. Confirm against OracleConnection.
        cursor.execute(
            "select distinct cc_attach from product$component_crawl where cc_b2c_attach is null and cc_attach is not null and cc_task=(select cct_id from product$component_crawl_task where cct_taskid=:task_code)",
            {"task_code": str(self.task_code)})
        pdf_datas = cursor.fetchall()
        cursor.close()
        self.db.conn.commit()
        self.db.conn.close()
        return [pdf_data[0] for pdf_data in pdf_datas]

    @staticmethod
    def _parse_download_link(pdf_url):
        """Split a ``downloadLinkClick(...)`` reference into its arguments.

        Returns ``(file_name, isSeriesData, isProductsData,
        isProductsDataGraph)``, or ``None`` when ``pdf_url`` does not have
        that shape.
        """
        found = re.match(r'downloadLinkClick\((.*?)\);return false', pdf_url)
        if found is None:
            return None
        content_list = found.group(1).split(",")
        if len(content_list) < 4:
            return None
        return (content_list[0].replace("'", ""), content_list[1],
                content_list[2], content_list[3])

    def download(self, pdf_url):
        """Download the PDF referenced by ``pdf_url``.

        Returns the local file path, or ``None`` when ``pdf_url`` cannot be
        parsed. Failed downloads are retried through another proxy until
        one succeeds. The old recursive retry threw its result away and
        returned the path of the failed attempt.
        """
        # The original matched against an undefined name ``a`` and raised
        # NameError on every call; the reference to parse is ``pdf_url``.
        parsed = self._parse_download_link(pdf_url)
        if parsed is None:
            print("unrecognised attachment reference:", pdf_url)
            return None
        remote_name, isSeriesData, isProductsData, isProductsDataGraph = parsed

        url = ("http://ds.yuden.co.jp/TYCOMPAS/cs/detail.do?mode=download&fileName="
               + remote_name)
        DownloadForm = {
            "action": "detail.do",
            "classificationID": "AE",
            "fileName": remote_name,
            "isSeriesData": isSeriesData,
            "isProductsData": isProductsData,
            "isProductsDataGraph": isProductsDataGraph
        }

        while True:
            # NOTE(review): this POST download to a fixed developer path is
            # kept as-is (it ran on every attempt before). It looks like
            # left-over debugging; confirm whether it is still needed.
            html_analyse = HtmlAnalyse(url)
            html_analyse.post_download(
                data=DownloadForm,
                path="I:\\PythonPrj\\StandardSpider\\DataAnalyse\\NewRules\\a.pdf")

            local_path = self.path + str(random.random()) + '.pdf'
            try:
                html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
                html_analyse.download(local_path)
                print("下载完成。。。")
                return local_path
            except Exception as e:
                print(e)
                self.proxy_pool.remove(self.proxy_ip)
                self.proxy_ip = self.proxy_pool.get()

    def upload(self, filename, pdf_url):
        """Upload ``filename`` and record its server path for ``pdf_url``.

        Does nothing when ``filename`` is empty or not an existing file.
        Other failures are retried in a loop. The old version recursed
        without bound.
        """
        if not filename or not os.path.isfile(filename):
            return None

        while True:
            try:
                with open(filename, 'rb') as pdf_file:
                    res = requests.post("http://10.10.100.200:9999/file/upload",
                                        files={'file': pdf_file})
                res_j = res.json()
                print("上传完成")

                db = OracleConnection()
                try:
                    cursor = db.conn.cursor()
                    # Bound parameters: pdf_url is scraped text and normally
                    # contains single quotes, which broke the formatted SQL.
                    cursor.execute(
                        "update product$component_crawl set cc_b2c_attach=:server_path where cc_attach=:source_url",
                        {"server_path": res_j['path'], "source_url": pdf_url})
                    cursor.close()
                    db.conn.commit()
                finally:
                    db.conn.close()
                return None
            except Exception as e:
                print(e)

    def go(self):
        """Download and upload every pending PDF, one after another."""
        for pdf_url in self.get_urls_from_db():
            filename = self.download(pdf_url)
            self.upload(filename, pdf_url)

    def thread_go(self):
        """Same as ``go`` but dispatched through a ``ThreadingPool``."""
        pdf_urls = self.get_urls_from_db()

        def thread(pdfurl):
            filename = self.download(pdfurl)
            self.upload(filename, pdfurl)

        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread, pdf_urls)