def thread_go(self):
    pdf_urls = self.get_urls_from_db()

    def thread(pdfurl):
        filename = self.download(pdfurl)
        self.upload(filename, pdfurl)

    threading_pool = ThreadingPool()
    threading_pool.multi_thread(thread, pdf_urls)
def all_go(self):
    get_urls = GetUrls()
    first_classes = get_urls.get_first_classes()
    for first_class in first_classes:
        second_classes = get_urls.get_second_classes(first_class)
        for second_class in second_classes:
            self.detail = MLCC1Detail(second_class)
            page_urls = self.detail.get_class_components()
            threadingpool = ThreadingPool(4)
            threadingpool.multi_process(self.thread_go, page_urls)
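# ThreadingPool itself is not part of this section. Below is a minimal sketch
# of an interface that would satisfy the call sites above and below: the
# constructor takes an optional pool size, and multi_thread/multi_process map
# a function over an iterable of work items. Since the callers pass closures
# and bound methods (which a real process pool could not pickle), both methods
# are sketched here on top of threads; the real implementation may differ.
from multiprocessing.pool import ThreadPool


class ThreadingPool:
    def __init__(self, size=8):
        self.size = size

    def multi_thread(self, func, iterable):
        # Fan func out over the work items with a pool of worker threads.
        with ThreadPool(self.size) as pool:
            pool.map(func, list(iterable))

    def multi_process(self, func, iterable):
        # Same sketch as multi_thread; the name is kept from the call sites.
        self.multi_thread(func, iterable)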
def get_suppliers(self):
    def thread_go(page_url):
        html_analyse = HtmlAnalyse(page_url)
        # Retry until the listing page downloads and parses.
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(e)
        company_tags = bs_content.find_all(
            name="a", attrs={"target": "_blank", "href": re.compile(r"/\d+")})
        corporations = []
        col = self.db.All_Company_Name
        for company_tag in company_tags:
            corporation = company_tag.text.strip()
            corporation_dict = {
                "corporation": corporation,
                "province_url": city_url,  # closure variable from the loop below
                "page_url": page_url,
                "状态": "未完成",  # status field: "unfinished" (Mongo value kept verbatim)
                "from": "99114",
            }
            corporations.append(corporation)
            col.insert(corporation_dict)
        print(corporations)
        return corporations

    html_analyse = HtmlAnalyse("http://shop.99114.com/")
    bs_content = html_analyse.get_bs_contents()
    all_city_tags = bs_content.find_all(
        name="a", attrs={"href": re.compile(r"http://shop\.99114\.com/list/area")})
    for city_tag in all_city_tags:
        city_url = city_tag.get("href")
        html_analyse = HtmlAnalyse(city_url)
        bs_content = html_analyse.get_bs_contents()
        page_tag = bs_content.find_all(
            name="a", attrs={"href": re.compile(r"/list/area/")})[-2]
        page_count = int(page_tag.text.replace(",", ""))
        page_urls = map(
            lambda page_num: city_url[:-1] + str(page_num) + ".html",
            range(1, page_count + 1))
        # for page_url in page_urls:
        #     thread_go(page_url)
        threading_pool = ThreadingPool(12)
        threading_pool.multi_process(thread_go, page_urls)
def get_suppliers(self):
    def thread_go(page_url):
        html_analyse = HtmlAnalyse(page_url)
        # Retry until the listing page downloads and parses.
        while True:
            try:
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(e)
        ul_tag = bs_content.find(name="div", attrs={"class": "leftbox comlist"})
        li_tags = ul_tag.find_all(name="li")
        corporations = []
        col = self.db.All_Company_Name
        for li_tag in li_tags:
            corporation = li_tag.text.strip()
            corporation_dict = {
                "corporation": corporation,
                "province_url": province_url,  # closure variable from the loop below
                "page_url": page_url,
                "状态": "未完成",  # status field: "unfinished"
            }
            corporations.append(corporation)
            col.insert(corporation_dict)
        print(corporations)
        return corporations

    for province_id in range(1, 36):
        province_url = "http://www.soudh.com/province-" + str(province_id) + ".html"
        html_analyse = HtmlAnalyse(province_url)
        bs_content = html_analyse.get_bs_contents()
        # The pager span contains "当前为" ("currently at") and "共N页" ("N pages in total").
        page_tag = bs_content.find(name="span", text=re.compile(r'当前为'))
        page_count = int(re.match(r'.*?共(\d+)页', page_tag.text).group(1))
        page_urls = map(
            lambda page_num: province_url[:-5] + "-" + str(page_num) + ".html",
            range(1, page_count + 1))
        # for page_url in page_urls:
        #     thread_go(page_url)
        threading_pool = ThreadingPool()
        threading_pool.multi_thread(thread_go, page_urls)
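# HtmlAnalyse is also external to this section. A minimal sketch that would
# satisfy the calls above and below (constructor takes a URL and optionally a
# proxy dict; get_bs_contents fetches the page and returns a BeautifulSoup
# tree). The proxy-dict format is an assumption based on how proxies are
# usually passed to requests.
import requests
from bs4 import BeautifulSoup


class HtmlAnalyse:
    def __init__(self, url, proxy=None):
        self.url = url
        self.proxy = proxy  # e.g. {"http": "http://1.2.3.4:8080"} (assumed format)

    def get_bs_contents(self):
        # Fetch and parse the page; callers wrap this in their own retry loops.
        res = requests.get(self.url, proxies=self.proxy, timeout=20)
        res.raise_for_status()
        return BeautifulSoup(res.content, "lxml")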
                orcl_conn = OracleSave(1000002)
                orcl_conn.component_insert(component)
                for key_value in property_key_values:
                    orcl_conn.properties_insert(key_value)
                orcl_conn.commit()
                orcl_conn.conn.close()
                break
            except Exception as e:
                print("database save exception", e)
                count += 1
                # if count > 3:
                #     break


if __name__ == "__main__":
    mouser_go = MouserGo()
    # multi_category_structures = mouser_go.get_all_category()
    # mouser_go.category_to_csv(multi_category_structures)
    init_multi_category_trees = mouser_go.read_from_csv()
    # multi_category_trees = init_multi_category_trees[270:271]
    for i in range(271, 986, 5):
        multi_category_trees = init_multi_category_trees[i:i + 5]
        pages_category = mouser_go.get_page_url(multi_category_trees)
        # print(pages_category)
        # for page_category in pages_category:
        #     mouser_go.page_thread_go(page_category)
        threadingpool = ThreadingPool(16)
        threadingpool.multi_process(mouser_go.page_thread_go, pages_category)
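# OracleSave, used in the save loop above and again in extra_go below, is not
# shown in this section. From its call sites it opens an Oracle connection
# keyed by a source id, inserts one component row plus its (name, value)
# property rows, then commits. A rough sketch of that shape; the DSN, table
# and column names are placeholders, not the project's actual schema.
import cx_Oracle


class OracleSave:
    def __init__(self, source_id):
        self.source_id = source_id
        # Placeholder credentials; the real DSN is not shown in this section.
        self.conn = cx_Oracle.connect("user/password@host:1521/orcl")
        self.cursor = self.conn.cursor()

    def component_insert(self, component):
        # component is the 7-tuple the callers build:
        # (code, brand, first_category, second_category, page_url, attach, img)
        self.cursor.execute(
            "insert into components (source_id, code, brand, first_category,"
            " second_category, page_url, attach_url, img_url)"
            " values (:1, :2, :3, :4, :5, :6, :7, :8)",
            (self.source_id,) + tuple(component))

    def properties_insert(self, key_value):
        property_name, property_value = key_value
        self.cursor.execute(
            "insert into component_properties (source_id, name, value)"
            " values (:1, :2, :3)",
            (self.source_id, property_name, property_value))

    def commit(self):
        self.conn.commit()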
def get_page_url(self, second_category):
    first_category_name, second_category_name, second_category_url = second_category
    # Retry until the second-level category page downloads, rotating proxies.
    while True:
        try:
            html_analyse = HtmlAnalyse(second_category_url, proxy=self.proxy_ip)
            bs_content = html_analyse.get_bs_contents()
            break
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            self.proxy_ip = self.proxy_pool.get()
    ul_tag = bs_content.find(name="ul", attrs={"class": "brcategories"})
    third_category_tags = ul_tag.find_all(name="div", attrs={"class": "rsGARealEstate"})
    for third_category_tag in third_category_tags:
        third_category_name = third_category_tag.a.text
        third_category_url = Rs_Pre_Url + third_category_tag.a.get("href")
        while True:
            try:
                html_analyse = HtmlAnalyse(third_category_url, proxy=self.proxy_ip)
                bs_content = html_analyse.get_bs_contents()
                break
            except Exception as e:
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
        try:
            page_tag = bs_content.find(name="div", attrs={"class": "viewProdDiv"}).text
        except Exception as e:
            print(third_category_url, e, "page_tag not found")
            continue
        # The tag text contains "共N个" ("N items in total"); 20 items per page.
        flag = re.match(r".*?共(.*?)个", page_tag)
        page_count = int(int(flag.group(1).strip()) / 20 + 1)
        for page_num in range(int(page_count)):
            page_url = third_category_url + "?pn=" + str(page_num + 1)
            while True:
                try:
                    html_analyse = HtmlAnalyse(page_url, proxy=self.proxy_ip)
                    bs_content = html_analyse.get_bs_contents()
                    break
                except Exception as e:
                    print(sys._getframe().f_code.co_name, e)
                    self.proxy_ip = self.proxy_pool.get()
            component_url_tags = bs_content.find_all(name="a", attrs={"class": "tnProdDesc"})
            page_attributes = []
            for component_url_tag in component_url_tags:
                component_url = Rs_Pre_Url + component_url_tag.get("href")
                union_category_name = second_category_name + "---" + third_category_name
                page_attribute = (first_category_name, union_category_name, component_url)
                page_attributes.append(page_attribute)
            threadingpool = ThreadingPool(4)
            threadingpool.multi_process(self.thread_go, page_attributes)
            # for page_attribute in page_attributes:
            #     self.thread_go(page_attribute)
        continue
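# The fetch / retry / rotate-proxy loop above appears three times in this one
# method. A sketch of how it could be factored into a helper, assuming
# HtmlAnalyse and the proxy pool behave exactly as they do at those call sites:
def fetch_with_retry(self, url):
    while True:
        try:
            html_analyse = HtmlAnalyse(url, proxy=self.proxy_ip)
            return html_analyse.get_bs_contents()
        except Exception as e:
            print(sys._getframe().f_code.co_name, e)
            # Rotate to a fresh proxy and try again.
            self.proxy_ip = self.proxy_pool.get()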
col.update({"corporation": key_word}, {'$set': { "状态": "已完成" }}, multi=True) print(key_word, "已完成") conn.close() break except Exception as e: print(e) continue # 470 if __name__ == "__main__": socket.setdefaulttimeout(30) mongo_conn = MongoClient("10.10.101.22", 27017) col = mongo_conn.spider.All_Company_Name search_list = SearchList() key_words = [] for data in col.find({ "状态": "未完成", "province_url": "http://www.soudh.com/province-6.html" }): key_word = data["corporation"] # search_list.get_all_urls(key_word) key_words.append(key_word) threadingpool = ThreadingPool(300) threadingpool.multi_process(search_list.get_all_urls, key_words)
        return [[i] for i in ls]
    else:
        j = int(ls_len / n)
        k = ls_len % n
        # Chunks of size j, j, j, ... (n-1 of them), then a final chunk of j + k.
        # Step j, n-1 iterations:
        ls_return = []
        for i in range(0, (n - 1) * j, j):
            ls_return.append(ls[i:i + j])
        # Append the tail chunk of j + k items.
        ls_return.append(ls[(n - 1) * j:])
        return ls_return


if __name__ == "__main__":
    while True:
        rows = get_component2()
        try:
            if len(rows) > 19:
                ls_return = div_list(rows, 20)
                threadingpool = ThreadingPool(4)
                threadingpool.multi_process(thread_go, ls_return)
            else:
                thread_go(rows)
        except Exception as e:
            print(e)
        if len(rows) == 0:
            break
        # for ls_ret in ls_return:
        #     thread_go(ls_ret)
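# A quick check of the chunking above (assuming the function head is
# div_list(ls, n) with ls_len = len(ls), as the body implies): 23 items split
# 4 ways give j = 5, k = 3, i.e. three chunks of 5 and a tail of 5 + 3 = 8.
chunks = div_list(list(range(23)), 4)
print([len(c) for c in chunks])  # -> [5, 5, 5, 8]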
cursor.execute("delete from ac$us$detail where name='{}'".format( init_row[1])) elif row == init_row: cursor.execute( "update ac$us$detail set modifystatus=0 where name='{}'". format(init_row[1])) else: sql = "update ac$us$detail set adminname='{}',name='{}',shortname='{}',industry='{}',tel='{}',address='{}',type='{}',modifystatus=1 where name='{}'".format( *row, init_row[1].replace("'", "''")) cursor.execute(sql) print(row, init_row, (row == init_row)) cursor.close() conn.commit() conn.close() if __name__ == "__main__": while True: try: data_inside = DataInside() rows = data_inside.get_data() if not rows: break # for row in rows: # data_inside.modify_data(row) threadingpool = ThreadingPool(8) threadingpool.multi_process(data_inside.modify_data, rows) except Exception as e: print(sys._getframe().f_code.co_name)
    else:
        while True:
            try:
                col.update({"url": url},
                           {'$set': {"data": brief_companies, "状态": "已完成"}})
                print(url, "已完成")  # marked as finished
                break
            except Exception as e:
                print(e)
    conn.close()
    return


if __name__ == "__main__":
    socket.setdefaulttimeout(30)
    mongo_conn = MongoClient("10.10.101.22", 27017)
    col = mongo_conn.spider.All_Company_Info
    # detail_info.get_detail("http://www.tianyancha.com/company/2546208953")
    urls = []
    for data in col.find({"状态": "未完成"}):
        url = data["url"]
        # detail_info.get_detail(url)
        urls.append(url)
    detail_info = DetailInfo()
    threadingpool = ThreadingPool(800)
    threadingpool.multi_process(detail_info.get_detail, urls)
    # Valentine's Day
def extra_go(self, category_tree):
    my_headers = Default_Header
    my_headers["host"] = "www.newark.com"
    my_headers["Referer"] = "http://www.newark.com/"
    my_headers["Upgrade-Insecure-Requests"] = "1"
    first_category_name = category_tree[0]
    second_category_name = str(category_tree[1:-2])
    url, component_count = category_tree[-2:]
    page_count = int(int(component_count) / 25) + 1
    page_range = range(875, 17557)

    def extra_thread(page_num):
        page_url = url + "/prl/results/" + str(page_num)
        count = 0
        while True:
            try:
                self.my_session.headers.update(my_headers)
                self.my_session.proxies.update(self.proxy_ip)
                res = self.my_session.get(page_url, timeout=20)
                if res.status_code != 200:
                    print(res.status_code)
                    self.proxy_pool.remove(self.proxy_ip)
                    self.proxy_ip = self.proxy_pool.get()
                    continue
                bs_content = BeautifulSoup(res.content, "lxml")
                component_tags = bs_content.find(
                    name="table", id="sProdList").tbody.find_all(name="tr")
                break
            except Exception as e:
                count += 1
                print(sys._getframe().f_code.co_name, e)
                self.proxy_ip = self.proxy_pool.get()
                if count > 10:
                    print(category_tree, page_url)
                    component_tags = []
                    break
                if count > 100:
                    # Unreachable as written: the loop exits once count > 10.
                    self.proxy_pool._refresh()

        for component_tag in component_tags:
            detail_table = component_tag.find(name="table", attrs={"class": "TFtable"})
            td_tags = component_tag.find_all(name="td")
            try:
                component_code = td_tags[1].text.strip()
            except Exception as e:
                print("component code is None", e)
                continue
            try:
                component_img = td_tags[1].find(
                    name="img", attrs={"class": "productThumbnail"}).get("src")
            except:
                component_img = ""
            try:
                # "数据表" is the "datasheet" link label on the page.
                rough_attach = td_tags[2].find(name="a", text="数据表")
                if not rough_attach:
                    rough_attach = td_tags[2].find(
                        name="a", attrs={"class": "prodDetailsAttachment"})
                component_attach = rough_attach.get("href")
                if "http" not in component_attach:
                    component_attach = ""
            except Exception as e:
                component_attach = ""
            try:
                manufacture_description = td_tags[3].a.find_all(name="p")
                component_brand = manufacture_description[0].text.strip()
                component_description = manufacture_description[1].text.strip()
            except Exception as e:
                component_brand = ""
                print(sys._getframe().f_code.co_name, e)
                continue
            if not component_img and not component_attach and not component_brand:
                continue
            component = (component_code, component_brand, first_category_name,
                         second_category_name, page_url, component_attach, component_img)
            count = 0
            while True:
                try:
                    orcl_conn = OracleSave(1000003)
                    orcl_conn.component_insert(component)
                    if detail_table:
                        property_tags = detail_table.find_all(name="tr")
                        for property_tag in property_tags:
                            detail_td_tags = property_tag.find_all("td")
                            property_name = detail_td_tags[0].text.strip()
                            property_value = detail_td_tags[1].text.strip()
                            key_value = (property_name, property_value)
                            orcl_conn.properties_insert(key_value)
                    orcl_conn.commit()
                    orcl_conn.conn.close()
                    break
                except Exception as e:
                    print(e)
                    count += 1
                    # if count > 3:
                    #     break

    extra_threading = ThreadingPool(8)
    extra_threading.multi_process(extra_thread, page_range)
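# The proxy pool used throughout extra_thread is external to this section.
# From the calls above it exposes get(), remove(ip) and _refresh(); a
# thread-safe sketch of that shape. The proxy-dict format and the refill
# behaviour of _refresh are assumptions, not the project's implementation.
import random
import threading


class ProxyPool:
    def __init__(self, proxies=None):
        self._lock = threading.Lock()
        self._proxies = list(proxies or [])

    def get(self):
        # Hand out a random live proxy, e.g. {"http": "http://1.2.3.4:8080"}.
        with self._lock:
            return random.choice(self._proxies) if self._proxies else None

    def remove(self, proxy_ip):
        # Drop a proxy that returned a bad status code.
        with self._lock:
            if proxy_ip in self._proxies:
                self._proxies.remove(proxy_ip)

    def _refresh(self):
        # Placeholder: the real pool presumably re-fetches proxies from a provider.
        pass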