def get_urls_pdfs(self):
    """Collect part codes, detail-page URLs and datasheet links from the saved listing page.

    Parses the locally saved ``htmlcode.html`` file, walks every part-number
    cell, and skips parts whose code already has a row in
    ``product$component_crawl``.

    Returns:
        list[tuple]: one ``(code, url, pdf)`` tuple per part that is not yet
        stored. ``pdf`` is ``""`` when the cell has no second ``div`` or that
        ``div`` has no anchor; otherwise it is the anchor's ``link`` attribute.
    """
    from contextlib import closing

    # Raw string: identical path value, without relying on invalid escape
    # sequences such as "\P" or "\S".
    html_path = r"I:\PythonPrj\StandardSpider\Spider\Rohm\ChipResistorNetworks\htmlcode.html"
    with open(html_path, "r", encoding="utf-8") as f:
        content = f.read()
    bs_content = BeautifulSoup(content, "html.parser")
    part_cells = bs_content.find_all(
        name="td", attrs={"align": "left", "class": "part-name PartNumber"})

    pdfs_urls = []
    if not part_cells:
        # Nothing to check, so no database connection is needed.
        return pdfs_urls

    # One connection for the whole scan; `closing` guarantees that cursor and
    # connection are released even on the duplicate ("repeat") path or on error.
    orcl_con = OracleConnection()
    with closing(orcl_con.conn) as conn, closing(conn.cursor()) as cursor:
        for cell in part_cells:
            tag_url_pdf = cell.find_all(name="div")
            url_code = tag_url_pdf[0].a
            code = url_code.text

            # Bind variable instead of string formatting: scraped text may
            # contain quotes. Assumes `.conn` is a cx_Oracle-style connection
            # that supports named binds -- TODO confirm.
            cursor.execute(
                "select cc_id from product$component_crawl where cc_code = :code",
                {"code": code})
            if cursor.fetchone():
                print("repeat")
                continue

            url = Rohm_Pre_Url + url_code.get("href")
            try:
                pdf = tag_url_pdf[1].a.get("link")
            except (IndexError, AttributeError):
                # No second <div>, or it holds no <a>: no datasheet link.
                pdf = ""
            pdfs_urls.append((code, url, pdf))
    return pdfs_urls
def get_product_list(self):
    """Collect detail-page URLs and part codes for products not yet stored.

    Scans every content node returned by ``self.get_all_content()`` for
    anchors pointing at ``/ac/c/search_num/index.jsp`` and skips parts whose
    code already has a row in ``product$component_crawl``.

    Returns:
        tuple[list, list]: ``(urls, codes)`` -- two parallel lists, one entry
        per new product.
    """
    from contextlib import closing

    series_contents = self.get_all_content()

    # Compiled once instead of on every iteration.
    link_href = re.compile(r'/ac/c/search_num/index\.jsp')
    detail_href = re.compile(
        r'(/ac/c/search_num/index\.jsp).*?(\?c=detail&part_no=.*$)')

    urls = []
    codes = []
    # One connection for the whole scan; `closing` guarantees that cursor and
    # connection are released even on the duplicate ("repeat") path or on error.
    orcl_con = OracleConnection()
    with closing(orcl_con.conn) as conn, closing(conn.cursor()) as cursor:
        for series_content in series_contents:
            rough_urls_codes = series_content.find_all(
                name="a", attrs={"href": link_href})
            for rough_url_code in rough_urls_codes:
                code = rough_url_code.text

                # Bind variable instead of string formatting: scraped text
                # may contain quotes. Assumes `.conn` is a cx_Oracle-style
                # connection that supports named binds -- TODO confirm.
                cursor.execute(
                    "select cc_id from product$component_crawl where cc_code = :code",
                    {"code": code})
                if cursor.fetchone():
                    print("repeat")
                    continue

                rough_url = rough_url_code.get("href")
                re_url = detail_href.match(rough_url)
                if re_url is None:
                    # The anchor matched the loose search pattern but is not a
                    # detail link; skip it rather than fail on `None.group`.
                    continue
                url = Pre_Panasonic_Device_Url + re_url.group(1) + re_url.group(2)
                codes.append(code)
                urls.append(url)
    return urls, codes
def get_task_id(self):
    """Look up the ``cct_id`` of the crawl task identified by ``self.task_code``.

    Returns:
        The first column (``cct_id``) of the matching row in
        ``product$component_crawl_task``.

    Raises:
        TypeError: when no row matches ``self.task_code`` (``fetchone()``
            returns ``None``); unchanged from the previous behaviour.
    """
    from contextlib import closing

    orcl_conn = OracleConnection()
    # `closing` releases both cursor and connection on every path; previously
    # the connection was never closed and the cursor leaked on an empty result.
    with closing(orcl_conn.conn) as conn, closing(conn.cursor()) as cursor:
        # Bind variable instead of string formatting. Assumes `.conn` is a
        # cx_Oracle-style connection that supports named binds -- TODO confirm.
        cursor.execute(
            "select cct_id from product$component_crawl_task where cct_taskid = :task_code",
            {"task_code": self.task_code})
        row = cursor.fetchone()
    return row[0]
def __init__(self, task_code):
    """Set up the crawler: task code, proxy, temp directory and DB handle.

    Args:
        task_code: stored unchanged on ``self.task_code``.
    """
    self.task_code = task_code

    # Obtain one proxy address from a fresh pool.
    pool = ProxyPool()
    self.proxy_pool = pool
    self.proxy_ip = pool.get()

    # Temp directory, relative to the current working directory.
    tmp_dir = "..\\tmp\\"
    self.path = tmp_dir
    directory_missing = not os.path.exists(tmp_dir)
    if directory_missing:
        os.mkdir(tmp_dir)

    self.db = OracleConnection()
def get_code_urls(self, series_url):
    """Collect product detail URLs for every page of one series listing.

    Fetches the series page to find the last page number, then fetches each
    listing page, reads the ``odd``/``even`` table rows (skipping the first)
    and keeps the products whose code has no row in
    ``product$component_crawl`` yet.

    Args:
        series_url: URL of the series listing; ``"&page=<n>"`` is appended to
            it for each page.

    Returns:
        list[str]: ``Panasonic_Pre_Url + href`` for each new product.
    """
    from contextlib import closing

    def get_pages_urls(url):
        """Return the listing URL for every page from 0 to the last page."""
        html_analyse = HtmlAnalyse(url, is_proxy=True)
        bs_content = html_analyse.get_bs_contents()
        page_tag = bs_content.find(name="a", attrs={"title": "到最后一页"}, text="末页 »")
        try:
            rough_page = page_tag.get("href")
            page = re.match(r"/ea/products/.*?page=(\d+)&reset=1", rough_page).group(1)
        except (AttributeError, TypeError):
            # No "last page" link, no href on it, or href in another format:
            # treat the series as a single page.
            page = 0
        return [url + "&page=" + str(i) for i in range(int(page) + 1)]

    row_class = re.compile(u"(^odd$)|(^even$)")
    product_urls = []
    for page_url in get_pages_urls(series_url):
        html_analyse = HtmlAnalyse(page_url)
        bs_contents = html_analyse.get_bs_contents()
        rows = bs_contents.find_all(name='tr', attrs={"class": row_class})
        if not rows:
            continue

        # One connection per listing page, opened only after the page has
        # been fetched. `closing` guarantees that cursor and connection are
        # released even on the duplicate ("repeat") path or on error.
        orcl_con = OracleConnection()
        with closing(orcl_con.conn) as conn, closing(conn.cursor()) as cursor:
            for row in rows[1:]:
                try:
                    model = row.td.a
                    code = model.text
                except AttributeError:
                    # Row without a <td>/<a>: stop processing this page.
                    break

                # ---- de-duplication ----
                # Bind variable instead of string formatting: scraped text
                # may contain quotes. Assumes `.conn` is a cx_Oracle-style
                # connection that supports named binds -- TODO confirm.
                cursor.execute(
                    "select cc_id from product$component_crawl where cc_code = :code",
                    {"code": code})
                if cursor.fetchone():
                    print("repeat")
                    continue
                # ---- end of de-duplication ----

                href = model.get("href")
                product_urls.append(Panasonic_Pre_Url + href)
    return product_urls
def get_task_id(self):
    """Look up the ``cct_id`` of the crawl task identified by ``self.task_code``.

    Returns:
        The first column (``cct_id``) of the matching row in
        ``product$component_crawl_task``, or ``None`` when no row matches.
        In that case a message is printed and the call sleeps for three
        seconds before returning.
    """
    from contextlib import closing

    orcl_conn = OracleConnection()
    # `closing` releases both cursor and connection on every path; previously
    # the connection was never closed and the cursor leaked on an empty result.
    with closing(orcl_conn.conn) as conn, closing(conn.cursor()) as cursor:
        # Bind variable instead of string formatting. Assumes `.conn` is a
        # cx_Oracle-style connection that supports named binds -- TODO confirm.
        cursor.execute(
            "select cct_id from product$component_crawl_task where cct_taskid = :task_code",
            {"task_code": self.task_code})
        row = cursor.fetchone()

    if row is None:
        # Explicit empty-result check replaces the former bare `except`.
        print("数据为空,请检查任务号,程序即将关闭")
        time.sleep(3)
        return None
    return row[0]
def upload(self, filename, pdf_url):
    """Upload a local file and record the returned path against its source URL.

    Posts ``filename`` to the file service, then sets ``cc_b2c_attach`` to the
    ``path`` field of the JSON response on every ``product$component_crawl``
    row whose ``cc_attach`` equals ``pdf_url``.

    Any exception (file, HTTP, JSON or database) is printed and swallowed, so
    this is best-effort and always returns ``None``.

    Args:
        filename: path of the local file to upload.
        pdf_url: original attachment URL used to find the rows to update.
    """
    from contextlib import closing

    try:
        with open(filename, 'rb') as file:
            res = requests.post("http://10.10.100.200:9999/file/upload", files={'file': file})
        res_j = res.json()
        print("上传完成")

        db = OracleConnection()
        # `closing` releases cursor and connection even when the update or
        # the commit fails; previously both leaked on error.
        with closing(db.conn) as conn, closing(conn.cursor()) as cursor:
            # Bind variables instead of string formatting: both values come
            # from outside the program. Assumes `.conn` is a cx_Oracle-style
            # connection that supports named binds -- TODO confirm.
            cursor.execute(
                "update product$component_crawl set cc_b2c_attach = :new_attach "
                "where cc_attach = :src_attach",
                {"new_attach": res_j['path'], "src_attach": pdf_url})
            conn.commit()
    except Exception as e:
        print(e)
def kill_session(os_user_name="zhanghy"):
    """Kill every Oracle session that holds an object lock for one OS user.

    Queries ``v$locked_object`` joined with ``dba_objects`` and ``v$session``
    for the given OS user and issues ``alter system kill session`` for each
    row. A failed kill is printed and skipped; a successful one prints
    ``"kill success"``.

    Args:
        os_user_name: value matched against ``os_user_name``; defaults to the
            previously hard-coded ``"zhanghy"``.
    """
    from contextlib import closing

    conn = OracleConnection()
    # `closing` releases the connection and every cursor on all paths;
    # previously the first cursor was never closed and the per-session cursor
    # leaked whenever a kill failed.
    with closing(conn.conn) as connection:
        with closing(connection.cursor()) as cursor:
            # Assumes `.conn` is a cx_Oracle-style connection that supports
            # named binds -- TODO confirm.
            cursor.execute(
                "select sess.sid,sess.serial#,lo.oracle_username,lo.os_user_name,"
                "ao.object_name,lo.locked_mode "
                "from v$locked_object lo, dba_objects ao, v$session sess "
                "where ao.object_id = lo.object_id and lo.session_id = sess.sid "
                "and os_user_name = :os_user",
                {"os_user": os_user_name})
            sessions = cursor.fetchall()

        for session in sessions:
            sid = session[0]
            serial = session[1]
            with closing(connection.cursor()) as cursor:
                try:
                    # The session identifier is part of the statement text and
                    # both values come from the query above, so the statement
                    # is still built with format().
                    sql = "alter system kill session '{},{}'".format(sid, serial)
                    cursor.execute(sql)
                except Exception as e:
                    print(e)
                    continue
            print("kill success")
def __init__(self):
    """Ensure the temp directory exists and create the database handle."""
    # Every backslash is escaped explicitly. The value is identical to the
    # previous literal, which relied on invalid escape sequences ("\P", "\S").
    self.path = "I:\\PythonPrj\\StandardSpider\\tmp\\"
    # makedirs(exist_ok=True) avoids the race between an existence check and
    # mkdir, and also creates missing parent directories.
    os.makedirs(self.path, exist_ok=True)
    self.db = OracleConnection()
""" @description: @author: RoyalClown @date: 2016/11/16 """ from Lib.DBConnection.OracleConnection import OracleConnection conn = OracleConnection() cursor = conn.conn.cursor() cursor.execute( "select sess.sid,sess.serial#,lo.oracle_username,lo.os_user_name,ao.object_name,lo.locked_mode from v$locked_object lo, dba_objects ao, v$session sess where ao.object_id = lo.object_id and lo.session_id = sess.sid and os_user_name='RoyalClown'" ) sessions = cursor.fetchall() for session in sessions: sid = session[0] serial = session[1] cursor = conn.conn.cursor() sql = "alter system kill session '{},{}'".format(sid, serial) cursor.execute(sql) cursor.close() conn.conn.commit() print("kill success")
def __init__(self, task_code):
    """Store the task code, ensure the image directory exists, create the DB handle.

    Args:
        task_code: stored unchanged on ``self.task_code``.
    """
    self.task_code = task_code
    # Every backslash is escaped explicitly. The value is identical to the
    # previous literal, which relied on the invalid escape sequence "\i".
    self.path = "C:\\img\\"
    # makedirs(exist_ok=True) avoids the race between an existence check and
    # mkdir.
    os.makedirs(self.path, exist_ok=True)
    self.db = OracleConnection()