def parse_one_page(self, *params):
    # Monitor IDs: 3 (domestic PTA), 4 (overseas PTA), 5 (domestic MEG spot), 6 (overseas MEG),
    # 9 (polyester DTY 150D/48F low-stretch), 10 (direct-spun semi-dull POY 150D/48F),
    # 12 (direct-spun semi-dull FDY 150D/96F), 13 (1.4D direct-spun polyester staple),
    # 14 (semi-dull polyester chip), 15 (East China polyester chip), 39 (bright polyester chip)
    counter = 1
    while counter <= 2:
        try:
            for i in (3, 4, 5, 6, 9, 10, 12, 13, 14, 15, 39):
                data = {
                    "Monitor_IDs": "a_210000_3,a_210000_4,a_220000_5,a_220000_6",
                    "monitorId": i,
                    "startdate": params[0][0],
                    "enddate": params[0][1],
                    "type": "dd"
                }
                url = 'http://www.ccf.com.cn/dynamic_graph/getPrice.php'
                cookies = ast.literal_eval(params[0][3])
                response = self.post_one_page(url, self._headers, data, cookies)
                html2 = etree.HTML(response)
                sleeptime = random.randint(15, 30)
                print('\033[37;40m------- fetched price data for monitor %s, sleeping %s s -------\033[0m'
                      % (i, sleeptime))
                time.sleep(sleeptime)
                if str(params[0][0])[:7] == str(params[0][1])[:7]:
                    # Same year-month: the page shows at most two data rows (tr[2] and tr[3]).
                    for row in (2, 3):
                        index = html2.xpath('//div[@class="box_products_txt"]/table/tr[%s]/td[1]/text()' % row)
                        erdat = html2.xpath('//div[@class="box_products_txt"]/table/tr[%s]/td[2]/text()' % row)
                        sl = html2.xpath('//div[@class="box_products_txt"]/table/tr[%s]/td[3]/text()' % row)
                        if not index:
                            break
                        yield {
                            'index': str(index[0]),
                            'erdat': str(erdat[0].strip()[0:4] + erdat[0].strip()[5:7] + erdat[0].strip()[8:10]),
                            'sl': int(sl[0])
                        }
                else:
                    # Different months: all rows are listed under tr[2].
                    index = html2.xpath('//div[@class="box_products_txt"]/table/tr[2]/td[1]/text()')
                    erdat = html2.xpath('//div[@class="box_products_txt"]/table/tr[2]/td[2]/text()')
                    sl = html2.xpath('//div[@class="box_products_txt"]/table/tr[2]/td[3]/text()')
                    for k in range(len(index)):
                        yield {
                            'index': str(index[k]),
                            'erdat': str(erdat[k].strip()[0:4] + erdat[k].strip()[5:7] + erdat[k].strip()[8:10]),
                            'sl': int(sl[k])
                        }
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch CCF price data -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 6)
            counter += 1
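# The two-attempt `while counter <= 2 ... except` scaffolding recurs in every
# scraper in this module. A minimal sketch of factoring it into a reusable
# decorator; `retry_notify` and `on_failure` are illustrative names, not part
# of the original code.
import functools

def retry_notify(attempts=2, on_failure=None):
    """Run a generator-producing function up to `attempts` times; call
    `on_failure` (e.g. a wechat_auto notification) if every attempt fails."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    # Drain the generator so parsing errors surface here.
                    return list(func(*args, **kwargs))
                except Exception:
                    if attempt == attempts and on_failure is not None:
                        on_failure()
            return []
        return wrapper
    return decorator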
def question_search(date):
    file_dir = 'D:\\各装置主要生产日指标汇总'
    path = file_dir + '\\%s 各装置主要生产日指标汇总表.pdf' % date
    # Open the PDF and read the table on the first page.
    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        for table in first_page.extract_tables():  # this PDF currently holds a single table
            df = pd.DataFrame(table[5:])
            values = df.values  # one array per row
            for i in range(len(values)):
                # re.search(pattern, string): look for a '#' character anywhere in the row.
                if re.search('#', str(values[i])):
                    wechat_auto().send_mesg(0, 'person', 0)
                    return 'special character found'
    return 'no special character found'
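# A small sketch of what pdfplumber's extract_tables() yields: a list of
# tables, each table a list of rows, each row a list of cell strings (or
# None). table[5:] above therefore skips the first five header rows. The file
# name below is illustrative.
import pdfplumber

with pdfplumber.open('D:\\各装置主要生产日指标汇总\\example.pdf') as pdf:
    for table in pdf.pages[0].extract_tables():
        print(len(table), 'rows; header:', table[0])
        if len(table) > 5:
            print('first data row:', table[5])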
def parse_one_page(self, *params):
    # kczs: CCF inventory index  292000 POY / 290000 FDY / 291000 DTY / 280000 polyester staple
    # fhzs: CCF operating-rate index  210000 PTA / 222000 MEG (total) / 223000 coal-based MEG /
    #       220000 polyester / 230000 direct-spun filament / 240000 direct-spun staple /
    #       274000 polyester bottle chip
    # xyzs: CCF downstream index  110000 Jiangsu-Zhejiang loom operating rate /
    #       150000 Jiangsu-Zhejiang texturing operating rate
    counter = 1
    while counter <= 2:
        try:
            sj = (['kczs', 292000], ['kczs', 290000], ['kczs', 291000], ['kczs', 280000],
                  ['fhzs', 210000], ['fhzs', 222000], ['fhzs', 223000], ['fhzs', 220000],
                  ['fhzs', 230000], ['fhzs', 240000], ['fhzs', 274000],
                  ['xyzs', 110000], ['xyzs', 150000])
            for prod_class, prod_id in sj:
                data = {
                    "ProdClass": prod_class,
                    "ProdID": prod_id,
                    "startDate": "2019-06-01",
                    "endDate": params[0][1]
                }
                url = 'http://www.ccf.com.cn/dynamic_graph/index.php'
                cookies = ast.literal_eval(params[0][3])
                response = self.post_one_page(url, {}, data, cookies)
                sleeptime = random.randint(10, 15)
                print('\033[37;40m------- fetched inventory data, sleeping %s s -------\033[0m' % sleeptime)
                time.sleep(sleeptime)
                pattern = re.compile(
                    '<tr .*?><td align=center>(.*?)</td>.*?<td align=center>(.*?)</td>.*?'
                    '<td align=center>(.*?)</td>.*?<td align=center>(.*?)</td>.*?'
                    '<td align=center>(.*?)</td></tr>', re.S)
                items = re.findall(pattern, response)
                if not items:
                    print('\033[1;37;41m------- inventory table parse returned no rows -------\033[0m')
                for item in items:
                    yield {
                        'index': item[0],
                        'erdat': item[1].strip()[0:4] + item[1].strip()[5:7] + item[1].strip()[8:10],
                        'sl': item[2]
                    }
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch inventory index data -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 10)
            counter += 1
def parse_one_page(self, *params):
    data = {
        '__RequestVerificationToken': params[0][5][1],
        'StartDate': str(params[0][0]).replace('-', '/'),
        'EndDate': str(params[0][1]).replace('-', '/'),
        "FrequencyCode": "Daily",
        "IncludePrePublishedPrices": "false",
        "SelectedQuotes[0][Id]": "{i}petchem/8602792",
        "SelectedQuotes[0][PriceOption]": "Average",
        "SelectedQuotes[0][UsePrimaryYAxis]": "true",
        "PrimaryYAxisUnitCode": "ZZZ",
        "PrimaryYAxisCurrencyCode": "ZZZ",
        "SecondaryYAxisUnitCode": "ZZZ",
        "SecondaryYAxisCurrencyCode": 'ZZZ',
        "isFormulaRequest": "false",
        "preEntitledWorkspaceName": '',
    }
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
        "X-Requested-With": "XMLHttpRequest",
    }
    counter = 1
    while counter <= 2:
        try:
            url = 'https://www.icis.com/Dashboard/PurchasedPriceHistory/DisplayChartDualYAxis'
            response3 = requests.post(url, data=data, headers=headers,
                                      cookies=params[0][5][0], timeout=120)
            points = response3.json()['chartLines'][0]['pointList']
            for point in points:
                yield {
                    'index': '冰醋酸中间价',  # glacial acetic acid mid price
                    'erdat': str(point['pointDateString'].replace('-', '')),
                    'sl': point['pointPrice']
                }
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch acetic acid mid price -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 12)
            counter += 1
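# The parsing above assumes a JSON response shaped roughly like the sketch
# below, inferred from the field names the code reads; this is not an official
# ICIS schema.
sample = {
    'chartLines': [{
        'pointList': [
            {'pointDateString': '2019-06-03', 'pointPrice': 412.5},
            {'pointDateString': '2019-06-04', 'pointPrice': 415.0},
        ]
    }]
}
for point in sample['chartLines'][0]['pointList']:
    print(point['pointDateString'].replace('-', ''), point['pointPrice'])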
def parse_one_page(self, *params):
    # Fetch inventory/port data (MEG shipment statistics).
    counter = 1
    while counter <= 2:
        try:
            for j in ('1',):  # page numbers to scrape, 1..n
                url = ('http://www.ccf.com.cn/newscenter/index.php?cur_row_pos=0&cur_pg_num='
                       + j + '&Class_ID=1C0000')
                cookies = ast.literal_eval(params[0][3])
                response = self.get_one_page(url, {}, [], cookies)
                sleeptime = random.randint(10, 15)
                print('\033[37;40m------- fetched MEG shipment list page, sleeping %s s -------\033[0m'
                      % sleeptime)
                time.sleep(sleeptime)
                # No re.S here: the pattern must not match across newlines.
                pattern = re.compile(
                    '<li(?: | class=articlebreak )><span>.*?href="(.*?)" class="h1a2" '
                    'target="_blank" onmouseover="return overlib.*?>\d+月\d+日MEG发货统计</a></li>')
                items = re.findall(pattern, response)
                for i in range(0, 3):  # only the three newest articles; use len(items) for all
                    url = 'http://www.ccf.com.cn' + items[i]
                    data = re.findall('/newscenter/detail-1C0000-(.*?).shtml', items[i])[0][:8]  # article date
                    data = datetime.datetime.strptime(data, '%Y%m%d') + datetime.timedelta(days=-1)  # shift back one day
                    data = data.strftime('%Y%m%d')  # yyyymmdd
                    response = requests.get(url, headers={}, cookies=cookies)
                    sleeptime = random.randint(30, 45)
                    print('\033[37;40m------- fetched one MEG shipment article, sleeping %s s -------\033[0m'
                          % sleeptime)
                    time.sleep(sleeptime)
                    response.encoding = 'gbk'
                    pattern2 = re.compile('<div id=newscontent>.*?MEG发货量在(.*?)吨.*?</div>', re.S)
                    items2 = re.findall(pattern2, response.text)[:2]
                    if not items2:
                        print('\033[1;37;41m------- MEG shipment volume not found in article -------\033[0m')
                    for item in items2:
                        yield {
                            'index': 'MEG发货量',  # MEG shipment volume
                            'erdat': data,
                            'sl': item
                        }
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch inventory/port data -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 8)
            counter += 1
def parse_one_page(self, *params):
    # MEG port inventory detail.
    counter = 1
    while counter <= 2:
        try:
            for j in ('1',):  # page numbers to scrape, 1..n
                url = ('http://www.ccf.com.cn/newscenter/index.php?cur_row_pos=0&cur_pg_num='
                       + j + '&Class_ID=1C0000')
                cookies = ast.literal_eval(params[0][3])
                response = self.get_one_page(url, {}, [], cookies)
                sleeptime = random.randint(10, 15)
                print('\033[37;40m------- fetched MEG port inventory list page, sleeping %s s -------\033[0m'
                      % sleeptime)
                time.sleep(sleeptime)
                # No re.S here: the pattern must not match across newlines.
                pattern = re.compile(
                    '<li(?: | class=articlebreak )><span>.*?href="(.*?)" class="h1a2" '
                    'target="_blank" onmouseover="return overlib.*?MEG港口库存.*?</a></li>')
                items = re.findall(pattern, response)
                for i in range(0, 3):  # only the three newest articles; use len(items) for all
                    url = 'http://www.ccf.com.cn' + items[i]
                    data = re.findall('/newscenter/detail-1C0000-(.*?).shtml', items[i])[0][:8]  # article date
                    response = requests.get(url, cookies=cookies)
                    sleeptime = random.randint(30, 45)
                    print('\033[37;40m------- fetched one MEG port inventory article, sleeping %s s -------\033[0m'
                          % sleeptime)
                    time.sleep(sleeptime)
                    response.encoding = 'gbk'
                    pattern2 = re.compile(
                        '<p(?:| style="text-indent: 2em;")>(?:CCF讯|受封航影响).*?今日华东主港地区(.*?)约(.*?)万吨.*?</p>')
                    matches = re.findall(pattern2, response.text)
                    if not matches:
                        print('\033[1;37;41m------- MEG main-port inventory not found in article -------\033[0m')
                    else:
                        yield {
                            'index': 'MEG主港库存',  # MEG main-port inventory
                            'erdat': data,
                            'sl': matches[0][1]
                        }
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch MEG port inventory data -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 9)
            counter += 1
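# The two CCF news scrapers above share the same list-page -> article -> regex
# flow. A minimal sketch of the shared article step; `fetch_ccf_article` is an
# illustrative name, not part of the original module.
import requests

def fetch_ccf_article(path, cookies, pattern):
    """Fetch one CCF article (GBK-encoded) and return all matches of `pattern`."""
    response = requests.get('http://www.ccf.com.cn' + path, cookies=cookies)
    response.encoding = 'gbk'
    return pattern.findall(response.text)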
def main():
    try:
        # today: current date; ny: current year-month; yeday: four days earlier.
        today = datetime.date.today().strftime('%Y-%m-%d')
        ny = datetime.date.today().strftime('%Y-%m')
        yeday = datetime.datetime.strptime(today, '%Y-%m-%d') + datetime.timedelta(days=-4)
        yeday = yeday.strftime('%Y-%m-%d')
        # Parameters handed on, in order: start date, today, year-month, and the three cookie sets.
        ccf_cookie = str(get_market_values().ccf_cookie())
        ccfei_cookie = str(get_market_values().ccfei_cookie())
        icis_cookie = get_market_values().icis_cookie()
        param = [yeday, today, ny, ccf_cookie, ccfei_cookie, icis_cookie]
        sqltuple = get_market_values().parse_concurrent(param)
        # Open the connection and insert the rows.
        conn = get_connection().hana_connection()
        ins_sql = '''insert into "COMMON"."XFM_MARKT" (ZB, ERDAT, SPJ) values ('%s','%s',%s)'''
        sql().sql_req(conn, tuple(sqltuple), sql=ins_sql)
        get_connection().close_connection(conn)
    except Exception as e:
        print(e)
        wechat_auto().send_mesg(0, 'person', 13)
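# A sketch of the same insert using DB-API parameter binding instead of '%s'
# string formatting, which avoids quoting and injection problems. Assumes the
# HANA connection object is DB-API 2.0 compliant (hdbcli is); the helper name
# is illustrative.
def insert_market_rows(conn, rows):
    """rows: iterable of (ZB, ERDAT, SPJ) tuples."""
    cursor = conn.cursor()
    try:
        cursor.executemany(
            'insert into "COMMON"."XFM_MARKT" (ZB, ERDAT, SPJ) values (?, ?, ?)',
            list(rows))
        conn.commit()
    finally:
        cursor.close()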
def ie_autoload():
    ie_option = IOptions()
    # Best-effort "hidden mode" flags (IE has no true headless mode).
    ie_option.add_argument('--disable-gpu')
    ie_option.add_argument('--no-sandbox')
    ie_option.add_argument('--disable-dev-shm-usage')
    ie_driver = webdriver.Ie(options=ie_option)
    # Log in and keep the session cookie.
    try:
        url1 = 'https://auth.xfmgroup.com/cas/login?' \
               'service=http%3a%2f%2fmes.xfmgroup.com%2fWebUI%2fIPWeb%2f'
        ie_driver.get(url1)
        time.sleep(5)
        ie_driver.find_element_by_xpath('//*[@id="fm1"]/div/div[3]/input[4]').click()
        time.sleep(5)
        # Open the page that triggers the computation.
        url2 = 'http://fbbb.xfmgroup.com/FBBB/Integrates/DispatcherIn.aspx?' \
               'funcid=00010013&DATE=%s' % yesterday2
        ie_driver.get(url2)
        # Wait for the frame named "left" to load, then switch into it.
        WebDriverWait(ie_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//frame[@name="left"]')))
        ie_driver.switch_to.frame('left')
        time.sleep(1)
        ie_driver.find_element_by_id('btnCompute').click()
        # Wait until the compute button has reloaded.
        WebDriverWait(ie_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//input[@id="btnCompute"]')))
        # Done.
        ie_driver.quit()
    except Exception:
        # Abnormal exit.
        ie_driver.quit()
        wechat_auto().send_mesg(0, 'person', 5)
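# Sketch: replacing the fixed sleeps before clicks with an explicit wait for
# clickability. presence_of_element_located only guarantees the node exists in
# the DOM; element_to_be_clickable also waits until it is visible and enabled.
# `click_when_ready` is an illustrative helper, not part of the original code.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def click_when_ready(driver, element_id, timeout=30):
    button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.ID, element_id)))
    button.click()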
def chrome_autoload():
    chrome_option = COptions()
    file_dir = 'D:\\各装置主要生产日指标汇总'
    # Headless mode; popups disabled (0) and downloads routed to file_dir.
    prefs = {'profile.default_content_settings.popups': 0,
             'download.default_directory': file_dir}
    chrome_option.add_experimental_option('prefs', prefs)
    chrome_option.add_argument('--headless')
    chrome_driver = webdriver.Chrome(options=chrome_option)
    try:
        # Log in.
        url2 = 'http://192.168.2.81:8080/BOE/OpenDocument/opendoc/openDocument.jsp?sIDType=CUID' \
               '&iDocID=AcoYQKP5WQlNng0bCsVeAcw&lsSERDAT=%s' % yesterday
        chrome_driver.get(url2)
        time.sleep(1)
        chrome_driver.find_element_by_id('_id0:logon:USERNAME').send_keys('BI_USER')
        chrome_driver.find_element_by_id('_id0:logon:PASSWORD').send_keys('Xfm@2019')
        chrome_driver.find_element_by_id('_id0:logon:logonButton').click()
        # Wait for the opendoc frame, switch into it, then wait for and switch into the webi frame.
        WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//iframe[@id="openDocChildFrame"]')))
        chrome_driver.switch_to.frame('openDocChildFrame')
        WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//iframe[@id="webiViewFrame"]')))
        chrome_driver.switch_to.frame('webiViewFrame')
        # Wait until loading finishes (the wait dialog's visibility flips from visible to hidden).
        try:
            WebDriverWait(chrome_driver, 60).until(EC.presence_of_element_located(
                (By.XPATH, "//div[@id='modal_waitDlg' and contains(@style,'visibility: hidden')]")))
        except Exception as e:
            print(e, 'timed out waiting for the report to load')
        # Download the file and rename it.
        try:
            # If an alert dialog is present, the computation did not succeed.
            chrome_driver.find_element_by_id('ariaLabelledBy_alertDlg')
            chrome_driver.quit()
            return 'compute failed'
        except Exception:
            try:
                # Wait until the export button is loaded and click it.
                WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="_dhtmlLib_270"]')))
                chrome_driver.find_element_by_id('_dhtmlLib_270').click()
                # Wait for the download dialog, select the report, confirm.
                WebDriverWait(chrome_driver, 30).until(EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="check_SelectAllReport"]')))
                chrome_driver.find_element_by_id('check_SelectAllReport').click()
                time.sleep(3)
                chrome_driver.find_element_by_id('check_1783').click()
                time.sleep(3)
                chrome_driver.find_element_by_id('BtnCImg_OK_BTN_idExportDlg').click()
                time.sleep(20)
                old_dir = file_dir + '\\各装置主要生产日指标汇总.pdf'
                new_dir = file_dir + '\\%s 各装置主要生产日指标汇总表.pdf' % yesterday
                if os.path.exists(new_dir):
                    os.remove(new_dir)
                    time.sleep(1)
                os.rename(old_dir, new_dir)
            except Exception:
                chrome_driver.quit()
                return 'compute failed'
        # Done.
        chrome_driver.quit()
        return 'download complete'
    except Exception:
        # Abnormal exit.
        chrome_driver.quit()
        wechat_auto().send_mesg(0, 'person', 4)
        return 'abnormal exit'
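# Sketch: instead of a fixed time.sleep(20) after clicking export, poll the
# download directory until the PDF exists and Chrome's temporary .crdownload
# file has disappeared. The timeout value and helper name are illustrative.
import os
import time

def wait_for_download(path, timeout=120):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(path) and not os.path.exists(path + '.crdownload'):
            return True
        time.sleep(1)
    return False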
if __name__ == '__main__':
    ie_autoload()
    time.sleep(1)
    response = chrome_autoload()
    i = 1
    # Retry the computation up to five times in total; give up after that.
    while response == 'compute failed' and i <= 4:
        print('\033[37;40m--------------- chrome attempt %s -----------------\033[0m' % (i + 1))
        ie_autoload()
        time.sleep(1)
        response = chrome_autoload()
        i = i + 1
    # If all five attempts failed, send a message asking for a code check.
    if i == 5:
        wechat_auto().send_mesg(0, 'person', 3)
    # Run the content check on the downloaded PDF.
    try:
        question = question_search(yesterday)
        print(question)
    except Exception:
        print('document to check was not found')
import os

from 接口.setting import wechat_auto, fixed_params

yesterday = fixed_params().yesterday
filename = ' %s.pdf' % yesterday
chfilename = 'D:\\各装置主要生产日指标汇总\\各装置主要生产日指标汇总表'
if os.path.exists(chfilename + filename):
    # Arguments, in order: self, path, the Chinese part of the file name, the rest of the name.
    wechat_auto().send_file(namenum=0, filenum=0, filenum2=1, filename=filename)
else:
    # Notify a person to handle it; arguments: self, text 1.
    wechat_auto().send_mesg(0, 1)
def parse_one_page(self, *params):
    counter = 1
    while counter <= 2:
        try:
            for pagnum in (1,):
                page = '&pos=%s&act=next&page=%s' % ((pagnum - 1) * 60, pagnum)
                url = ('http://www.ometal.com/bin0/new/searchkey_cj.asp?'
                       'type=%B3%A4%BD%AD%D3%D0%C9%AB%BD%F0%CA%F4%CF%D6%BB%F5&searchtype=&newsort=7' + page)
                response = self.get_one_page(url, self._headers, [], [])
                # [\s\S]*? matches any character, newlines included.
                pattern = re.compile(
                    '<tr>[\s\S]*?<td align="left" class="s105">·[\s\S]*?<a href="(.*?)" target="_blank">'
                    '<span style="color:black;background-color:yellow">长江有色金属现货</span>.*?', re.S)
                items = re.findall(pattern, response)
                for i in range(0, 5):  # only the five newest articles
                    time.sleep(0.5)
                    url2 = 'http://www.ometal.com' + items[i]
                    response2 = self.get_one_page(url2, self._headers, [], [])
                    # Skip articles missing cobalt, electrolytic manganese, or an average price.
                    if (response2.find('钴') == -1 or response2.find('电解锰') == -1
                            or response2.find('均价') == -1):
                        continue
                    html = etree.HTML(response2)
                    # Normalize no-break spaces so the regexes below can match.
                    response2 = response2.replace('\xa0', ' ')
                    # Parse the quote date out of the article URL.
                    day_ori = items[i][9:items[i].find('marketnew') - 1]
                    day = datetime.datetime.strptime(day_ori, '%Y/%m/%d').strftime('%Y%m%d')
                    key_values = {'1#钴': 11, '1#电解锰': 15}  # product name -> table row
                    for j in (11, 15):
                        index_name = '1#钴' if j == 11 else '1#电解锰'
                        if day > '20190110' and response2.find('吨') != -1:
                            # After 2019-01-10, when the unit '吨' appears, parse the table
                            # layout; fall back to a style-based regex if xpath finds nothing.
                            try:
                                name_str = ('//div[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/p/text() | '
                                            '//*[@id="fontzoom"]/div/div/table/tbody/tr[%s]/td[1]/p/text() | '
                                            '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/p/span[2]/text() | '
                                            '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/text() | '
                                            '//div[@id="fontzoom"]/div/table/tbody/tr[%s]/td[1]/p/text() | '
                                            '//*[@id="fontzoom"]/table/tbody/tr[%s]/td[1]/p/strong/span/text()'
                                            % (j, j, j, j, j, j))
                                sl_str = ('//div[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/p/text() | '
                                          '//*[@id="fontzoom"]/div/div/table/tbody/tr[%s]/td[5]/p/text() | '
                                          '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/p/span/text() | '
                                          '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/text() | '
                                          '//div[@id="fontzoom"]/div/table/tbody/tr[%s]/td[5]/p/text() | '
                                          '//*[@id="fontzoom"]/table/tbody/tr[%s]/td[5]/p/strong/span/text()'
                                          % (j, j, j, j, j, j))
                                name = html.xpath(name_str)[0]
                                sl = str(html.xpath(sl_str)[0]).replace(',', '')
                                yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
                            except Exception:
                                name = list(key_values.keys())[list(key_values.values()).index(j)]
                                pattern = re.compile(
                                    '''%s[\s\S]*?(?:.*?(?:mso-spacerun: 'yes'; mso-font-kerning: 0.0000pt"|'''
                                    '''mso-font-kerning: 0.0000pt; mso-spacerun: 'yes';")>){3}'''
                                    '''(.*?)[</span>]{1}''' % name, re.S)
                                sl = str(re.findall(pattern, response2)[0]).replace(',', '')
                                yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
                        elif day > '20190110' and response2.find('吨') == -1:
                            sl_str = '//div[@id="fontzoom"]/table/tbody/tr[%s]/td[4]/text()' % j
                            sl = str(html.xpath(sl_str)[0]).replace(',', '')
                            yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
                        else:
                            # Older articles: try each known markup variant in turn.
                            try:
                                name = list(key_values.keys())[list(key_values.values()).index(j)]
                                pattern = re.compile(
                                    '%s</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td>' % name, re.S)
                                sl = str(re.findall(pattern, response2)[0][2]).replace(',', '')
                                yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
                            except Exception:
                                try:
                                    name_str2 = '1# 钴' if j == 11 else '1# 电解锰'
                                    pattern = re.compile(
                                        '%s(?:</p></td><td.*?><p.*?>(.*?)){4}</p>' % name_str2, re.S)
                                    sl = str(re.findall(pattern, response2)[0]).replace(',', '')
                                    yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
                                except Exception:
                                    try:
                                        name_str2 = '1# 钴' if j == 11 else '1# 电解锰'
                                        pattern = re.compile(
                                            '%s(?:</td><td.*?>(.*?)){4}</td>' % name_str2, re.S)
                                        sl = str(re.findall(pattern, response2)[0]).replace(',', '')
                                        yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
                                    except Exception:
                                        name_str2 = '钴' if j == 11 else '电解锰'
                                        pattern = re.compile(
                                            '%s</span></p>(?:</td><td.*?><p>(.*?)</p>){4}' % name_str2, re.S)
                                        sl = str(re.findall(pattern, response2)[0]).replace(',', '')
                                        yield {'index': index_name, 'erdat': day, 'sl': int(sl)}
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch cobalt and manganese price data -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 11)
            counter += 1
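# Sketch: the nested try/except regex fallbacks above can be flattened into an
# ordered list of candidate patterns tried in turn. `extract_price` is an
# illustrative helper, not part of the original module.
import re

def extract_price(html_text, patterns):
    """Return the first value produced by any pattern, or None."""
    for pattern in patterns:
        found = pattern.findall(html_text)
        if found:
            # Same normalization as the scraper: strip thousands separators.
            value = found[0][-1] if isinstance(found[0], tuple) else found[0]
            return int(str(value).replace(',', ''))
    return None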
def parse_one_page(self, *params):
    counter = 1
    while counter <= 2:
        try:
            for pg in ('1',):  # page numbers to scrape, 1..n
                url = ('http://www.ccf.com.cn/newscenter/index.php?cur_row_pos=0&cur_pg_num='
                       + pg + '&Class_ID=1B0000')
                cookies = ast.literal_eval(params[0][3])
                response = self.get_one_page(url, self._headers, [], cookies)
                sleeptime = random.randint(5, 10)
                print('\033[37;40m------- fetched textile-city list page, sleeping %s s -------\033[0m'
                      % sleeptime)
                time.sleep(sleeptime)
                pattern = re.compile(
                    '<li(?: | class=articlebreak )><span>.*?href="(.*?)" class="h1a2" '
                    'target="_blank" onmouseover="return overlib.*?</a></li>', re.S)
                items = re.findall(pattern, response)  # article URLs on this page
                for i in range(0, 3):  # only the three newest articles; use len(items) for all
                    url = 'http://www.ccf.com.cn' + items[i]
                    data = re.findall('/newscenter/detail-1B0000-(.*?).shtml', items[i])[0][:8]  # article date
                    response = requests.get(url, cookies=cookies)
                    sleeptime = random.randint(15, 30)
                    print('\033[37;40m------- fetched one textile-city article, sleeping %s s -------\033[0m'
                          % sleeptime)
                    time.sleep(sleeptime)
                    response.encoding = 'gbk'
                    html2 = etree.HTML(response.text)
                    # The table layout changed twice, so the row offsets depend on the article date.
                    if data >= '20171020':
                        for row in (2, 3):
                            zxl = html2.xpath('//tbody/tr[%s]/td[1]/text()' % row)[0]
                            sl = html2.xpath('//tbody/tr[%s]/td[2]/text()' % row)[0]
                            yield {'index': str(zxl), 'erdat': str(data), 'sl': int(sl)}
                    elif data >= '20170117':
                        for row in (3, 4):
                            zxl = html2.xpath('//tbody/tr[%s]/td[1]/text()' % row)[0]
                            sl = html2.xpath('//tbody/tr[%s]/td[2]/text()' % row)[0]
                            yield {'index': str(zxl), 'erdat': str(data), 'sl': int(sl)}
                    else:
                        zxl = html2.xpath('normalize-space(//tbody/tr[3]/td[1]/p/text())')
                        sl = html2.xpath('//tbody/tr[3]/td[2]/text()')[0]
                        yield {'index': str(zxl)[-3:], 'erdat': str(data), 'sl': int(sl)}
                        zxl = html2.xpath('//tbody/tr[4]/td[1]/text()')[0]
                        sl = html2.xpath('//tbody/tr[4]/td[2]/text()')[0]
                        yield {'index': str(zxl), 'erdat': str(data), 'sl': int(sl)}
            break
        except Exception:
            if counter == 2:
                print('\033[1;37;41m------- failed to fetch textile-city data -------\033[0m')
                wechat_auto().send_mesg(0, 'person', 7)
            counter += 1